diff --git a/.github/workflows/build-operations-metrics-container.yml b/.github/workflows/build-operations-metrics-container.yml index c52d82db0..c331e70aa 100644 --- a/.github/workflows/build-operations-metrics-container.yml +++ b/.github/workflows/build-operations-metrics-container.yml @@ -9,13 +9,11 @@ on: - main paths: - .github/workflows/build-operations-metrics-container.yml - - 'llvm-ops-metrics/ops-container/**' + - 'premerge/ops-container/**' pull_request: - branches: - - main paths: - .github/workflows/build-operations-metrics-container.yml - - 'llvm-ops-metrics/ops-container/**' + - 'premerge/ops-container/**' jobs: build-operations-metrics-container: @@ -29,7 +27,7 @@ jobs: - name: Checkout LLVM Zorg uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - sparse-checkout: llvm-ops-metrics/ops-container + sparse-checkout: premerge/ops-container - name: Write Variables id: vars run: | @@ -39,9 +37,9 @@ jobs: echo "container-name-tag=$container_name:$tag" >> $GITHUB_OUTPUT echo "container-filename=$(echo $container_name:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT - name: Build Container - working-directory: ./llvm-ops-metrics/ops-container + working-directory: premerge/ops-container run: | - podman build -t ${{ steps.vars.outputs.container-name-tag }} -f Dockerfile . + podman build -t ${{ steps.vars.outputs.container-name-tag }} . # Save the container so we have it in case the push fails. This also # allows us to separate the push step into a different job so we can # maintain minimal permissions while building the container. @@ -55,7 +53,7 @@ jobs: path: ${{ steps.vars.outputs.container-filename }} retention-days: 14 - push-metrics-container: + push-operations-metrics-container: if: github.event_name == 'push' needs: - build-operations-metrics-container @@ -76,4 +74,3 @@ jobs: podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io podman push ${{ needs.build-operations-metrics-container.outputs.container-name-tag }} podman push ${{ needs.build-operations-metrics-container.outputs.container-name }}:latest - diff --git a/.github/workflows/build-premerge-buildbot-container.yml b/.github/workflows/build-premerge-buildbot-container.yml new file mode 100644 index 000000000..cfb2a0f49 --- /dev/null +++ b/.github/workflows/build-premerge-buildbot-container.yml @@ -0,0 +1,76 @@ +name: Build Premerge Buildbot Container + +permissions: + contents: read + +on: + push: + branches: + - main + paths: + - .github/workflows/build-premerge-buildbot-container.yml + - 'premerge/buildbot/**' + pull_request: + paths: + - .github/workflows/build-premerge-buildbot-container.yml + - 'premerge/buildbot/**' + +jobs: + build-premerge-buildbot-container: + if: github.repository_owner == 'llvm' + runs-on: ubuntu-24.04 + outputs: + container-name: ${{ steps.vars.outputs.container-name }} + container-name-tag: ${{ steps.vars.outputs.container-name-tag }} + container-filename: ${{ steps.vars.outputs.container-filename }} + steps: + - name: Checkout LLVM Zorg + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + sparse-checkout: premerge/buildbot + - name: Write Variables + id: vars + run: | + tag=`date +%s` + container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/premerge-buildbot" + echo "container-name=$container_name" >> $GITHUB_OUTPUT + echo "container-name-tag=$container_name:$tag" >> $GITHUB_OUTPUT + echo "container-filename=$(echo $container_name:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT + - name: Build Container 
+        working-directory: ./premerge/buildbot
+        run: |
+          podman build -t ${{ steps.vars.outputs.container-name-tag }} -f Dockerfile .
+      # Save the container so we have it in case the push fails. This also
+      # allows us to separate the push step into a different job so we can
+      # maintain minimal permissions while building the container.
+      - name: Save Container Image
+        run: |
+          podman save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }}
+      - name: Upload Container Image
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+        with:
+          name: container
+          path: ${{ steps.vars.outputs.container-filename }}
+          retention-days: 14
+
+  push-premerge-buildbot-container:
+    if: github.event_name == 'push'
+    needs:
+      - build-premerge-buildbot-container
+    permissions:
+      packages: write
+    runs-on: ubuntu-24.04
+    env:
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - name: Download Container Image
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
+        with:
+          name: container
+      - name: Push Container
+        run: |
+          podman load -i ${{ needs.build-premerge-buildbot-container.outputs.container-filename }}
+          podman tag ${{ needs.build-premerge-buildbot-container.outputs.container-name-tag }} ${{ needs.build-premerge-buildbot-container.outputs.container-name }}:latest
+          podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io
+          podman push ${{ needs.build-premerge-buildbot-container.outputs.container-name-tag }}
+          podman push ${{ needs.build-premerge-buildbot-container.outputs.container-name }}:latest
diff --git a/buildbot/google/scripts/profcheck.sh b/buildbot/google/scripts/profcheck.sh
new file mode 100644
index 000000000..9703c69ba
--- /dev/null
+++ b/buildbot/google/scripts/profcheck.sh
@@ -0,0 +1,50 @@
+set -ex
+
+apt-get update
+apt-get install -y python3 python3-pip cmake ninja-build git ccache lsb-release wget software-properties-common gnupg
+pip3 install --break-system-packages buildbot-worker==3.11.7
+
+# Pass "llvm.sh" as $0 so that "20" is received as $1, the version argument.
+bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" llvm.sh 20
+ln -sf /usr/bin/clang-20 /usr/bin/cc
+ln -sf /usr/bin/clang++-20 /usr/bin/c++
+ln -sf /usr/bin/ld.lld-20 /usr/bin/ld
+
+rm -rf /b
+BOT_DIR=/b
+SERVER_PORT=9994
+WORKER_NAME="$(hostname)"
+WORKER_PASSWORD="$(gsutil cat gs://sanitizer-buildbot/$(hostname)-password)"
+
+# The buildbot user may not exist yet on a fresh machine, hence the || true.
+userdel buildbot || true
+groupadd -f buildbot
+useradd buildbot -g buildbot -m -d /b/home
+chown buildbot:buildbot $BOT_DIR
+
+sudo -u buildbot buildbot-worker create-worker -f --allow-shutdown=signal $BOT_DIR lab.llvm.org:$SERVER_PORT \
+    "${WORKER_NAME}" "${WORKER_PASSWORD}"
+
+{
+  echo "Mircea Trofin "
+  echo "Aiden Grossman "
+} > $BOT_DIR/info/admin
+
+{
+  echo "To reproduce locally, use a standard CMake invocation with -DLLVM_ENABLE_PROFCHECK=ON and -DLLVM_LIT_ARGS='--exclude-xfail'"
+  echo "Example:"
+  echo "cmake -GNinja"
+  echo "  -DCMAKE_BUILD_TYPE=Release"
+  echo "  -DLLVM_ENABLE_ASSERTIONS=ON"
+  echo "  -DLLVM_LIT_ARGS='--exclude-xfail'"
+  echo "  -DLLVM_ENABLE_PROFCHECK=ON"
+  echo
+  uname -a | head -n1
+  date
+  cmake --version | head -n1
+  c++ --version | head -n1
+  ld --version | head -n1
+  lscpu
+} > $BOT_DIR/info/host
+
+sudo -u buildbot buildbot-worker start $BOT_DIR
diff --git a/buildbot/osuosl/master/config/builders.py b/buildbot/osuosl/master/config/builders.py
index b92c48e76..28e706a80 100644
--- a/buildbot/osuosl/master/config/builders.py
+++ b/buildbot/osuosl/master/config/builders.py
@@ -18,6 +18,7 @@ from zorg.buildbot.builders import XToolchainBuilder
from zorg.buildbot.builders import TestSuiteBuilder from zorg.buildbot.builders import BOLTBuilder +from zorg.buildbot.builders import DebugifyBuilder from zorg.buildbot.builders import HtmlDocsBuilder from zorg.buildbot.builders import DoxygenDocsBuilder @@ -46,6 +47,11 @@ reload(StagedBuilder) +# Doxygen build takes a really long time. We want to collapse build requests +# more aggressively to better keep up with the changes. +def collapseRequestsDoxygen(master, builder, req1, req2): + return req1.get('reason', None) == req2.get('reason', None) + all = [ # Clang fast builders. @@ -344,21 +350,6 @@ checkout_lld=False, extra_cmake_args=["-DLLVM_TARGETS_TO_BUILD='ARM'"])}, - # ARMv7 LNT test-suite in test-only mode - {'name' : "clang-armv7-lnt", - 'tags' : ["clang"], - 'workernames' : ["linaro-clang-armv7-lnt"], - 'builddir': "clang-armv7-lnt", - 'factory' : ClangBuilder.getClangCMakeBuildFactory( - clean=False, - checkout_compiler_rt=False, - checkout_lld=False, - checks=[], - runTestSuite=True, - testsuite_flags=[ - '--cppflags', '-mcpu=cortex-a15 -marm', - '--threads=32', '--build-threads=32'])}, - ## ARMv7 check-all 2-stage {'name' : "clang-armv7-2stage", 'tags' : ["clang"], @@ -369,7 +360,11 @@ checkout_compiler_rt=False, checkout_lld=False, useTwoStage=True, - testStage1=False, + testStage1=True, + runTestSuite=True, + testsuite_flags=[ + '--cppflags', '-mcpu=cortex-a15 -marm', + '--threads=32', '--build-threads=32'], extra_cmake_args=[ "-DCMAKE_C_FLAGS='-mcpu=cortex-a15 -marm'", "-DCMAKE_CXX_FLAGS='-mcpu=cortex-a15 -marm'"])}, @@ -414,22 +409,33 @@ checkout_lld=False, extra_cmake_args=["-DLLVM_TARGETS_TO_BUILD='AArch64'"])}, - ## AArch64 check-all + LLD + test-suite 2-stage + # AArch64 2 stage build with lld, flang, compiler-rt, test-suite and SVE/SME + # mlir integration tests. {'name' : "clang-aarch64-lld-2stage", 'tags' : ["lld"], 'workernames' : ["linaro-clang-aarch64-lld-2stage"], 'builddir':"clang-aarch64-lld-2stage", 'factory' : ClangBuilder.getClangCMakeBuildFactory( clean=True, + checkout_flang=True, + checkout_lld=True, useTwoStage=True, runTestSuite=True, + env={ + 'NO_STOP_MESSAGE':'1', # For Fortran test-suite + }, testsuite_flags=[ '--cppflags', '-mcpu=neoverse-n1 -fuse-ld=lld', '--threads=32', '--build-threads=32'], extra_cmake_args=[ "-DCMAKE_C_FLAGS='-mcpu=neoverse-n1'", "-DCMAKE_CXX_FLAGS='-mcpu=neoverse-n1'", - "-DLLVM_ENABLE_LLD=True"])}, + "-DLLVM_ENABLE_LLD=True", + "-DLLVM_LIT_ARGS='-v'", + "-DMLIR_INCLUDE_INTEGRATION_TESTS=True", + "-DMLIR_RUN_ARM_SVE_TESTS=True", + "-DMLIR_RUN_ARM_SME_TESTS=True", + "-DARM_EMULATOR_EXECUTABLE=qemu-aarch64"])}, ## AArch64 run test-suite at -O0 (GlobalISel is now default). 
{'name' : "clang-aarch64-global-isel", @@ -466,44 +472,12 @@ # lld tests cause us to hit thread limits "-DLLVM_ENABLE_THREADS=OFF"])}, - # AArch64 check-all + flang + compiler-rt + test-suite + SVE/SME - # mlir-integration-tests 2-stage - {'name' : "clang-aarch64-full-2stage", - 'tags' : ["clang"], - 'workernames' : ["linaro-clang-aarch64-full-2stage"], - 'builddir': "clang-aarch64-full-2stage", - 'factory' : ClangBuilder.getClangCMakeBuildFactory( - clean=True, - checkout_flang=True, - checkout_lld=True, - useTwoStage=True, - testStage1=False, - runTestSuite=True, - env={ - 'NO_STOP_MESSAGE':'1', # For Fortran test-suite - }, - testsuite_flags=[ - '--cppflags', '-mcpu=neoverse-n1', - '--threads=32', '--build-threads=32'], - extra_cmake_args=[ - "-DCMAKE_C_FLAGS='-mcpu=neoverse-n1'", - "-DCMAKE_CXX_FLAGS='-mcpu=neoverse-n1'", - "-DLLVM_LIT_ARGS='-v'", - "-DMLIR_INCLUDE_INTEGRATION_TESTS=True", - "-DMLIR_RUN_ARM_SVE_TESTS=True", - "-DMLIR_RUN_ARM_SME_TESTS=True", - "-DARM_EMULATOR_EXECUTABLE=qemu-aarch64"])}, - # All SVE (as opposed to SVE2) builders are using optimisation flags # for Graviton 3 "balanced" from # https://github.com/aws/aws-graviton-getting-started/blob/main/c-c++.md. # AArch64 Clang+LLVM+RT+LLD check-all + flang + test-suite + - # mlir-integration-tests w/SVE-Vector-Length-Agnostic Note that in this and - # other clang-aarch64-sve-* builders we set -mllvm - # -treat-scalable-fixed-error-as-warning=false to make compiler fail on - # non-critical SVE codegen issues. This helps us notice and fix SVE - # problems sooner rather than later. + # mlir-integration-tests w/SVE-Vector-Length-Agnostic {'name' : "clang-aarch64-sve-vla", 'tags' : ["clang"], 'workernames' : ["linaro-g3-01", "linaro-g3-02", "linaro-g3-03", "linaro-g3-04"], @@ -516,8 +490,8 @@ 'NO_STOP_MESSAGE':'1', # For Fortran test-suite }, testsuite_flags=[ - '--cppflags', '-mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false -O3', - '--cmake-define', 'CMAKE_Fortran_FLAGS=-mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false -O3', + '--cppflags', '-mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -O3', + '--cmake-define', 'CMAKE_Fortran_FLAGS=-mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -O3', '--threads=32', '--build-threads=32'], extra_cmake_args=[ "-DCMAKE_C_FLAGS='-mcpu=neoverse-512tvb'", @@ -541,12 +515,12 @@ 'NO_STOP_MESSAGE':'1', # For Fortran test-suite }, testsuite_flags=[ - '--cppflags', '-mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false -O3', - '--cmake-define', 'CMAKE_Fortran_FLAGS=-mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false -O3', + '--cppflags', '-mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -O3', + '--cmake-define', 'CMAKE_Fortran_FLAGS=-mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -O3', '--threads=32', '--build-threads=32'], extra_cmake_args=[ - "-DCMAKE_C_FLAGS='-mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false'", - "-DCMAKE_CXX_FLAGS='-mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false'", + "-DCMAKE_C_FLAGS='-mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred'", + "-DCMAKE_CXX_FLAGS='-mcpu=neoverse-512tvb -mllvm -scalable-vectorization=preferred'", 
"-DLLVM_ENABLE_LLD=True", "-DMLIR_INCLUDE_INTEGRATION_TESTS=True", "-DMLIR_RUN_ARM_SVE_TESTS=True"])}, @@ -564,8 +538,8 @@ 'NO_STOP_MESSAGE':'1', # For Fortran test-suite }, testsuite_flags=[ - '--cppflags', '-mcpu=neoverse-512tvb -msve-vector-bits=256 -mllvm -treat-scalable-fixed-error-as-warning=false -O3', - '--cmake-define', 'CMAKE_Fortran_FLAGS=-mcpu=neoverse-512tvb -msve-vector-bits=256 -mllvm -treat-scalable-fixed-error-as-warning=false -O3', + '--cppflags', '-mcpu=neoverse-512tvb -msve-vector-bits=256 -O3', + '--cmake-define', 'CMAKE_Fortran_FLAGS=-mcpu=neoverse-512tvb -msve-vector-bits=256 -O3', '--threads=32', '--build-threads=32'], extra_cmake_args=[ "-DCMAKE_C_FLAGS='-mcpu=neoverse-512tvb'", @@ -589,12 +563,12 @@ 'NO_STOP_MESSAGE':'1', # For Fortran test-suite }, testsuite_flags=[ - '--cppflags', '-mcpu=neoverse-512tvb -msve-vector-bits=256 -mllvm -treat-scalable-fixed-error-as-warning=false -O3', - '--cmake-define', 'CMAKE_Fortran_FLAGS=-mcpu=neoverse-512tvb -msve-vector-bits=256 -mllvm -treat-scalable-fixed-error-as-warning=false -O3', + '--cppflags', '-mcpu=neoverse-512tvb -msve-vector-bits=256 -O3', + '--cmake-define', 'CMAKE_Fortran_FLAGS=-mcpu=neoverse-512tvb -msve-vector-bits=256 -O3', '--threads=32', '--build-threads=32'], extra_cmake_args=[ - "-DCMAKE_C_FLAGS='-mcpu=neoverse-512tvb -msve-vector-bits=256 -mllvm -treat-scalable-fixed-error-as-warning=false'", - "-DCMAKE_CXX_FLAGS='-mcpu=neoverse-512tvb -msve-vector-bits=256 -mllvm -treat-scalable-fixed-error-as-warning=false'", + "-DCMAKE_C_FLAGS='-mcpu=neoverse-512tvb -msve-vector-bits=256'", + "-DCMAKE_CXX_FLAGS='-mcpu=neoverse-512tvb -msve-vector-bits=256'", "-DLLVM_ENABLE_LLD=True", "-DMLIR_INCLUDE_INTEGRATION_TESTS=True", "-DMLIR_RUN_ARM_SVE_TESTS=True"])}, @@ -615,8 +589,8 @@ 'NO_STOP_MESSAGE':'1', # For Fortran test-suite }, testsuite_flags=[ - '--cppflags', '-mcpu=neoverse-v2 -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false -O3', - '--cmake-define', 'CMAKE_Fortran_FLAGS=-mcpu=neoverse-v2 -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false -O3', + '--cppflags', '-mcpu=neoverse-v2 -mllvm -scalable-vectorization=preferred -O3', + '--cmake-define', 'CMAKE_Fortran_FLAGS=-mcpu=neoverse-v2 -mllvm -scalable-vectorization=preferred -O3', '--threads=48', '--build-threads=48'], extra_cmake_args=[ "-DCMAKE_C_FLAGS='-mcpu=neoverse-v2'", @@ -641,12 +615,12 @@ 'NO_STOP_MESSAGE':'1', # For Fortran test-suite }, testsuite_flags=[ - '--cppflags', '-mcpu=neoverse-v2 -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false -O3', - '--cmake-define', 'CMAKE_Fortran_FLAGS=-mcpu=neoverse-v2 -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false -O3', + '--cppflags', '-mcpu=neoverse-v2 -mllvm -scalable-vectorization=preferred -O3', + '--cmake-define', 'CMAKE_Fortran_FLAGS=-mcpu=neoverse-v2 -mllvm -scalable-vectorization=preferred -O3', '--threads=48', '--build-threads=48'], extra_cmake_args=[ - "-DCMAKE_C_FLAGS='-mcpu=neoverse-v2 -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false'", - "-DCMAKE_CXX_FLAGS='-mcpu=neoverse-v2 -mllvm -scalable-vectorization=preferred -mllvm -treat-scalable-fixed-error-as-warning=false'", + "-DCMAKE_C_FLAGS='-mcpu=neoverse-v2 -mllvm -scalable-vectorization=preferred'", + "-DCMAKE_CXX_FLAGS='-mcpu=neoverse-v2 -mllvm -scalable-vectorization=preferred'", "-DLLVM_ENABLE_LLD=True", 
"-DMLIR_INCLUDE_INTEGRATION_TESTS=True", "-DMLIR_RUN_ARM_SVE_TESTS=True"])}, @@ -846,6 +820,7 @@ 'builddir': 'clang-sparc64-linux', 'factory' : ClangBuilder.getClangCMakeBuildFactory( clean=False, + timeout=1800, runTestSuite=True, checkout_clang_tools_extra=False, checkout_compiler_rt=False, @@ -1066,24 +1041,27 @@ "-DLLVM_USE_LINKER=gold", "-DLLVM_ENABLE_WERROR=OFF"])}, - {'name': "llvm-clang-key-instructions", - 'tags' : ["llvm", "clang", "compiler-rt", "lld", "cross-project-tests"], - 'workernames': ["sie-linux-worker5"], - 'builddir': "llvm-ki", - 'factory': UnifiedTreeBuilder.getCmakeWithNinjaBuildFactory( - depends_on_projects=['llvm','clang','compiler-rt','lld','cross-project-tests'], + {'name': "llvm-x86_64-debugify-coverage", + 'tags': ["llvm", "clang", "lld"], + 'workernames': ["sie-linux-worker5"], + 'builddir': "llvm-dbg", + 'factory': DebugifyBuilder.getDebugifyBuildFactory( + clean=True, + depends_on_projects=['llvm','clang','lld'], extra_configure_args=[ - "-DCMAKE_C_COMPILER=gcc", - "-DCMAKE_CXX_COMPILER=g++", - "-DCMAKE_BUILD_TYPE=Release", - "-DCLANG_ENABLE_CLANGD=OFF", - "-DLLVM_BUILD_RUNTIME=ON", + "-DCMAKE_C_COMPILER=clang", + "-DCMAKE_CXX_COMPILER=clang++", + "-DCMAKE_BUILD_TYPE=RelWithDebInfo", + "-DCMAKE_C_FLAGS_RELWITHDEBINFO=-O2 -gmlt -DNDEBUG", + "-DCMAKE_CXX_FLAGS_RELWITHDEBINFO=-O2 -gmlt -DNDEBUG", + "-DLLVM_CCACHE_BUILD=ON", "-DLLVM_BUILD_TESTS=ON", "-DLLVM_ENABLE_ASSERTIONS=ON", - "-DLLVM_EXPERIMENTAL_KEY_INSTRUCTIONS=ON", "-DLLVM_INCLUDE_EXAMPLES=OFF", - "-DLLVM_LIT_ARGS=--verbose --timeout=900", - "-DLLVM_USE_LINKER=gold"])}, + "-DLLVM_TARGETS_TO_BUILD=X86", + "-DLLVM_LIT_ARGS=-v", + "-DLLVM_USE_LINKER=lld", + "-DLLVM_ENABLE_WERROR=OFF"])}, {'name': "llvm-clang-x86_64-darwin", 'tags' : ["llvm", "clang", "clang-tools-extra", "lld", "cross-project-tests"], @@ -1852,9 +1830,21 @@ 'tags' : ["sanitizer"], 'workernames' : ["sanitizer-windows"], 'builddir': "sanitizer-windows", - 'factory' : AnnotatedBuilder.getAnnotatedBuildFactory( - script="sanitizer-windows.py", + 'factory': UnifiedTreeBuilder.getCmakeWithNinjaWithMSVCBuildFactory( + vs="autodetect", + clean=True, depends_on_projects=["llvm", "clang", "lld", "compiler-rt"], + target_arch='x64', + # TODO(boomanaiden154): We should probably be using sccache here. + extra_configure_args=[ + "-DCMAKE_BUILD_TYPE=Release", + "-DLLVM_ENABLE_ASSERTIONS=ON", + "-DLLVM_ENABLE_PDB=ON", + "-DLLVM_TARGETS_TO_BUILD=X86", + "-DCOMPILER_RT_BUILD_BUILTINS=ON", + "-DCOMPILER_RT_BUILD_ORC=OFF", + ], + checks=["check-compiler-rt"], # FIXME: Restore `timeout` to default when fixed https://github.com/llvm/llvm-project/issues/102513 timeout=2400)}, @@ -1977,6 +1967,8 @@ "-DLLVM_ENABLE_ASSERTIONS=ON", "-DCMAKE_C_COMPILER_LAUNCHER=ccache", "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache", + "-DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES='compiler-rt;openmp'", + "-DLLVM_RUNTIME_TARGETS=default;amdgcn-amd-amdhsa", ], env={ 'HSA_ENABLE_SDMA':'0', @@ -2094,37 +2086,13 @@ 'workernames' : ["omp-vega20-1"], # We would like to never collapse, but it seems the load is too high on that system to keep up. 
'builddir': "openmp-offload-libc-amdgpu-runtime", - 'factory' : OpenMPBuilder.getOpenMPCMakeBuildFactory( - clean=True, - depends_on_projects=['llvm', 'clang', 'compiler-rt', 'libc', 'lld', 'offload', 'openmp'], - # Special case this bot to account for new (verbose) libc build syntax - enable_runtimes=['openmp', 'compiler-rt', 'offload'], - extraCmakeArgs=[ - "-DCMAKE_BUILD_TYPE=Release", - "-DCLANG_DEFAULT_LINKER=lld", - "-DLLVM_TARGETS_TO_BUILD=X86;AMDGPU", - "-DLLVM_ENABLE_ASSERTIONS=ON", - "-DCMAKE_C_COMPILER_LAUNCHER=ccache", - "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache", - "-DLIBOMPTARGET_FOUND_AMDGPU_GPU=ON", - "-DLIBOMP_ARCHER_SUPPORT=OFF", - "-DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=libc", - "-DLLVM_RUNTIME_TARGETS=default;amdgcn-amd-amdhsa", - "-DRUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_TEST_ARCHITECTURE=gfx906", - ], - env={ - 'HSA_ENABLE_SDMA':'0', - }, - install=True, - testsuite=False, - testsuite_sollvevv=False, - extraTestsuiteCmakeArgs=[ - "-DTEST_SUITE_SOLLVEVV_OFFLOADING_CFLAGS=-fopenmp;-fopenmp-targets=amdgcn-amd-amdhsa;-Xopenmp-target=amdgcn-amd-amdhsa;-march=gfx906", - "-DTEST_SUITE_SOLLVEVV_OFFLOADING_LDLAGS=-fopenmp;-fopenmp-targets=amdgcn-amd-amdhsa;-Xopenmp-target=amdgcn-amd-amdhsa;-march=gfx906", - ], - add_lit_checks=["check-offload", "check-libc-amdgcn-amd-amdhsa"], - add_openmp_lit_args=["--filter-out=offloading/pgo1.c"], - )}, + 'factory' : AnnotatedBuilder.getAnnotatedBuildFactory( + depends_on_projects=['llvm', 'clang', 'compiler-rt', 'lld', 'libc', 'libcxx', 'libcxxabi', 'offload', 'openmp', 'libunwind'], + script='amdgpu-offload-cmake.py', + extra_args=['--cmake-file=AMDGPULibcBot.cmake'], + checkout_llvm_sources=True, + script_interpreter=None + )}, {'name' : "openmp-offload-amdgpu-clang-flang", 'tags' : ["openmp,flang"], @@ -2146,6 +2114,8 @@ "-DCMAKE_CXX_STANDARD=17", "-DBUILD_SHARED_LIBS=ON", "-DLIBOMPTARGET_PLUGINS_TO_BUILD=amdgpu;host", + "-DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES='compiler-rt;openmp'", + "-DLLVM_RUNTIME_TARGETS=default;amdgcn-amd-amdhsa", "-DCOMPILER_RT_BUILD_ORC=OFF", "-DCOMPILER_RT_BUILD_XRAY=OFF", "-DCOMPILER_RT_BUILD_MEMPROF=OFF", @@ -2321,15 +2291,6 @@ depends_on_projects=['llvm', 'libc', 'clang', 'clang-tools-extra'], extra_args=['--debug'])}, - {'name' : 'libc-arm32-debian-dbg', - 'tags' : ["libc"], - 'workernames' : ['libc-arm32-debian'], - 'builddir': 'libc-arm32-debian-dbg', - 'factory' : AnnotatedBuilder.getAnnotatedBuildFactory( - script="libc-linux.py", - depends_on_projects=['llvm', 'libc', 'clang', 'clang-tools-extra'], - extra_args=['--debug'])}, - {'name' : 'libc-arm32-qemu-debian-dbg', 'tags' : ["libc"], 'workernames' : ['libc-arm32-qemu-debian'], @@ -2672,6 +2633,29 @@ "-DCOMPILER_RT_BUILD_SANITIZERS=OFF", "-DLLVM_CCACHE_BUILD=ON"])}, + {'name' : "flang-arm64-windows-msvc-testsuite", + 'tags' : ["flang"], + 'workernames' : ["linaro-armv8-windows-msvc-06"], + 'builddir': "flang-arm64-win-msvc-ts", + 'factory' : ClangBuilder.getClangCMakeBuildFactory( + vs="manual", + clean=False, + checkout_flang=True, + checkout_lld=True, + runTestSuite=True, + testStage1=False, + testsuite_flags=[ + '--cmake-define', "TEST_SUITE_SUBDIRS='Fortran'", + '--use-make=ninja', + '--threads=8', + '--build-threads=8'], + extra_cmake_args=[ + "-DLLVM_TARGETS_TO_BUILD=AArch64", + "-DCLANG_DEFAULT_LINKER=lld", + "-DCMAKE_TRY_COMPILE_CONFIGURATION=Release", + "-DCOMPILER_RT_BUILD_SANITIZERS=OFF", + "-DLLVM_CCACHE_BUILD=ON"])}, + {'name' : 'ppc64-flang-aix', 'tags' : ["flang", "ppc", "ppc64", "aix"], 'workernames' : 
['ppc64-flang-aix-test'], @@ -2744,8 +2728,9 @@ {'name' : "publish-doxygen-docs", 'tags' : ["doc"], - 'workernames' : ["as-worker-4"], #FIXME: Temporarily disabled failing doxygen build - as-builder-8. + 'workernames' : ["as-worker-4"], 'builddir': "publish-doxygen-docs", + 'collapseRequests': collapseRequestsDoxygen, 'factory' : DoxygenDocsBuilder.getLLVMDocsBuildFactory( # Doxygen builds the final result for really # long time without any output. @@ -3571,51 +3556,90 @@ checkout_llvm_sources=False, script_interpreter=None, clean=True)}, - - # Builders similar to used in Buildkite premerge pipeline. - # Please keep in sync with llvm-project/.ci configurations. - - # See https://github.com/llvm/llvm-project/blob/main/.ci/monolithic-windows.sh. - {'name' : "premerge-monolithic-windows", - 'tags' : ["premerge"], - 'workernames' : ["premerge-windows-1"], - 'builddir': "premerge-monolithic-windows", - 'factory' : UnifiedTreeBuilder.getCmakeWithNinjaWithMSVCBuildFactory( - vs="autodetect", - depends_on_projects=["clang-tools-extra", "clang", "libclc", "lld", "llvm", "mlir", "polly"], - checks=["check-all"], - install_pip_requirements = True, - clean = True, - extra_configure_args=[ - "-DCMAKE_BUILD_TYPE=Release", - "-DLLVM_ENABLE_ASSERTIONS=ON", - "-DLLVM_BUILD_EXAMPLES=ON", - "-DCOMPILER_RT_BUILD_LIBFUZZER=OFF", - "-DLLVM_LIT_ARGS=-v", - "-DMLIR_ENABLE_BINDINGS_PYTHON=ON", - "-DCOMPILER_RT_BUILD_ORC=OFF", - "-DCMAKE_C_COMPILER_LAUNCHER=sccache", - "-DCMAKE_CXX_COMPILER_LAUNCHER=sccache"])}, - # See https://github.com/llvm/llvm-project/blob/main/.ci/monolithic-linux.sh. - {'name': "premerge-monolithic-linux", - 'tags' : ["premerge"], - 'collapseRequests': False, - 'workernames': ["premerge-linux-1"], - 'builddir': "premerge-monolithic-linux", - 'factory': UnifiedTreeBuilder.getCmakeWithNinjaBuildFactory( - depends_on_projects=["bolt", "clang", "clang-tools-extra", "compiler-rt", "flang", "flang-rt", "libc", "libclc", "lld", "llvm", "mlir", "polly"], - install_pip_requirements = True, - extra_configure_args=[ - "-DCMAKE_BUILD_TYPE=Release", - "-DLLVM_ENABLE_ASSERTIONS=ON", - "-DLLVM_BUILD_EXAMPLES=ON", - "-DCOMPILER_RT_BUILD_LIBFUZZER=OFF", - "-DMLIR_ENABLE_BINDINGS_PYTHON=ON", - "-DLLVM_LIT_ARGS=-v", - "-DLLVM_ENABLE_LLD=ON", - "-DCMAKE_CXX_FLAGS=-gmlt", - "-DLLVM_CCACHE_BUILD=ON"])}, - + + # Builders that test the premerge configuration + # These builders are specifically for running the premerge configuration + # postcommit (after changes have landed in main). The configuration for + # running these checks premerge exists in the monorepo inside the + # .github/workflows/premerge.yaml file. 
+ { + "name": "premerge-monolithic-linux", + "workernames": [ + "premerge-us-central-linux-b1", + "premerge-us-central-linux-b2", + "premerge-us-central-linux-b3", + "premerge-us-west-linux-b1", + "premerge-us-west-linux-b2", + "premerge-us-west-linux-b3", + ], + "collapseRequests": False, + "builddir": "premerge-monolithic-linux", + "factory": AnnotatedBuilder.getAnnotatedBuildFactory( + script="premerge/dispatch_job.py", + checkout_llvm_sources=False, + extra_args=["Linux"], + depends_on_projects=[ + "bolt", + "clang", + "clang-tools-extra", + "compiler-rt", + "flang", + "flang-rt", + "libc", + "libclc", + "lld", + "lldb", + "llvm", + "mlir", + "polly", + "libunwind", + "libcxx", + "libcxxabi", + ], + ), + }, + { + "name": "premerge-monolithic-windows", + "collapseRequests": False, + "workernames": [ + "premerge-us-central-windows-b1", + "premerge-us-central-windows-b2", + "premerge-us-central-windows-b3", + "premerge-us-west-windows-b1", + "premerge-us-west-windows-b2", + "premerge-us-west-windows-b3", + ], + "builddir": "premerge-monolithic-windows", + "factory": AnnotatedBuilder.getAnnotatedBuildFactory( + script="premerge/dispatch_job.py", + checkout_llvm_sources=False, + extra_args=["Windows"], + depends_on_projects=[ + "clang-tools-extra", + "clang", + "libclc", + "lld", + "llvm", + "mlir", + "polly", + ], + ), + }, + # Builders for the profcheck configuration + # These workers run builds with LLVM_ENABLE_PROFCHECK=ON to ensure + # that profile information is propagated correctly. + { + "name": "profcheck", + "workernames": ["profcheck-b1", "profcheck-b2"], + "collapseRequests": False, + "builddir": "profcheck-build", + "factory": AnnotatedBuilder.getAnnotatedBuildFactory( + script="profcheck.sh", + clean=True, + depends_on_projects=["llvm"], + script_interpreter=None, + ), + }, ] # LLDB remote-linux builder env variables. @@ -3640,7 +3664,20 @@ 'workernames': ["as-builder-9"], 'builddir': "lldb-remote-linux-ubuntu", 'factory': UnifiedTreeBuilder.getCmakeExBuildFactory( - depends_on_projects = ["llvm", "clang", "lld", "lldb"], + depends_on_projects = [ + 'llvm', + 'compiler-rt', + 'clang', + 'libunwind', + 'libcxx', + 'libcxxabi', + 'lld', + 'lldb', + ], + # Allow only these projects with LLVM_ENABLE_PROJECTS. + enable_projects = ["llvm", "clang", "lld", "lldb"], + # Use a proper list of runtimes (LLVM_ENABLE_RUNTIMES) from CrossWinToARMLinux.cmake. + # Avoid making it from a list of the depended projects. enable_runtimes = None, checks = [ "check-lldb-unit", @@ -3678,9 +3715,6 @@ "LLDB_ENABLE_CURSES" : "OFF", "LLDB_ENABLE_LZMA" : "OFF", "LLDB_ENABLE_LIBXML2" : "OFF", - # No need to build lldb-server during the first stage. - # We are going to build it for the target platform later. - "LLDB_CAN_USE_LLDB_SERVER" : "OFF", "LLDB_TEST_USER_ARGS" : util.Interpolate( "--env;ARCH_CFLAGS=-mcpu=cortex-a78;" \ "--platform-name;remote-linux;" \ @@ -3769,7 +3803,20 @@ 'workernames': ["as-builder-10"], 'builddir': "lldb-x-aarch64", 'factory': UnifiedTreeBuilder.getCmakeExBuildFactory( - depends_on_projects = ["llvm", "clang", "lld", "lldb"], + depends_on_projects = [ + 'llvm', + 'compiler-rt', + 'clang', + 'libunwind', + 'libcxx', + 'libcxxabi', + 'lld', + 'lldb', + ], + # Allow only these projects with LLVM_ENABLE_PROJECTS. + enable_projects = ["llvm", "clang", "lld", "lldb"], + # Use a proper list of runtimes (LLVM_ENABLE_RUNTIMES) from CrossWinToARMLinux.cmake. + # Avoid making it from a list of the depended projects. 
enable_runtimes = None, checks = [ "check-lldb-unit", @@ -3806,9 +3853,6 @@ "LLDB_ENABLE_CURSES" : "OFF", "LLDB_ENABLE_LZMA" : "OFF", "LLDB_ENABLE_LIBXML2" : "OFF", - # No need to build lldb-server during the first stage. - # We are going to build it for the target platform later. - "LLDB_CAN_USE_LLDB_SERVER" : "OFF", "LLDB_TEST_USER_ARGS" : util.Interpolate( "--env;ARCH_CFLAGS=-mcpu=cortex-a78;" \ "--platform-name;remote-linux;" \ diff --git a/buildbot/osuosl/master/config/status.py b/buildbot/osuosl/master/config/status.py index fa5c530a6..ffe40d0e4 100644 --- a/buildbot/osuosl/master/config/status.py +++ b/buildbot/osuosl/master/config/status.py @@ -242,7 +242,7 @@ def getReporters(): reporters.MailNotifier( fromaddr = status_email_fromaddr, sendToInterestedUsers = False, - extraRecipients = ["labath@google.com"], + extraRecipients = ["labath@google.com", "cmtice@google.com"], generators = [ utils.LLVMDefaultBuildStatusGenerator( builders = ["lldb-x86_64-debian"]) @@ -290,7 +290,6 @@ def getReporters(): builders = [ "libc-aarch64-ubuntu-dbg", "libc-aarch64-ubuntu-fullbuild-dbg", - "libc-arm32-debian-dbg", "libc-arm32-qemu-debian-dbg", "libc-riscv64-debian-dbg", "libc-riscv64-debian-fullbuild-dbg", @@ -343,7 +342,8 @@ def getReporters(): reporters.MailNotifier( fromaddr = status_email_fromaddr, sendToInterestedUsers = False, - extraRecipients = ["douglas.yung@sony.com"], + extraRecipients = ["douglas.yung@sony.com", + "douglasyung.llvm@gmail.com" ], generators = [ utils.LLVMDefaultBuildStatusGenerator( builders = [ @@ -360,7 +360,7 @@ def getReporters(): "llvm-clang-x86_64-darwin", "llvm-clang-aarch64-darwin", "llvm-clang-aarch64-darwin-release", - "llvm-clang-key-instructions"]) + "llvm-x86_64-debugify-coverage"]) ]), reporters.MailNotifier( fromaddr = status_email_fromaddr, @@ -485,8 +485,7 @@ def getReporters(): utils.LLVMDefaultBuildStatusGenerator( builders = [ "cross-project-tests-sie-ubuntu", - "llvm-clang-x86_64-sie-win", - "llvm-clang-key-instructions"]) + "llvm-clang-x86_64-sie-win"]) ]), reporters.MailNotifier( fromaddr = status_email_fromaddr, @@ -553,22 +552,47 @@ def getReporters(): reporters.MailNotifier( fromaddr = status_email_fromaddr, sendToInterestedUsers = False, - extraRecipients = ["llvm-premerge-buildbots@google.com", "joker.eph@gmail.com"], + extraRecipients = ["szakharin@nvidia.com"], generators = [ utils.LLVMDefaultBuildStatusGenerator( builders = [ - "premerge-monolithic-windows", - "premerge-monolithic-linux"]) + "flang-runtime-cuda-gcc", + "flang-runtime-cuda-clang"]) ]), reporters.MailNotifier( fromaddr = status_email_fromaddr, sendToInterestedUsers = False, - extraRecipients = ["szakharin@nvidia.com"], + extraRecipients = ["stephen.tozer@sony.com"], generators = [ utils.LLVMDefaultBuildStatusGenerator( builders = [ - "flang-runtime-cuda-gcc", - "flang-runtime-cuda-clang"]) + "llvm-x86_64-debugify-coverage"]) + ]), + reporters.MailNotifier( + fromaddr = status_email_fromaddr, + sendToInterestedUsers = False, + extraRecipients = ["profcheck-buildbot@google.com"], + generators = [ + utils.LLVMDefaultBuildStatusGenerator( + builders = [ + "profcheck"]) + ]), + reporters.MailNotifier( + fromaddr=status_email_fromaddr, + sendToInterestedUsers=False, + extraRecipients=[ + "llvm-presubmit-infra@google.com", + "aidengrossman@google.com", + "cmtice@google.com", + ], + generators=[ + utils.LLVMDefaultBuildStatusGenerator( + subject="Premerge Buildbot Failure: {{ buildername }}", + builders=[ + "premerge-monolithic-linux", + "premerge-monolithic-windows", + ], + ) ]), 
]) diff --git a/buildbot/osuosl/master/config/workers.py b/buildbot/osuosl/master/config/workers.py index 284ec3bd3..644fb82d8 100644 --- a/buildbot/osuosl/master/config/workers.py +++ b/buildbot/osuosl/master/config/workers.py @@ -16,7 +16,6 @@ def get_all(): create_worker("as-worker-4", properties={'jobs' : 24}, max_builds=2), # ARMv7/ARMv8 Linaro workers - create_worker("linaro-clang-armv7-lnt", max_builds=1), create_worker("linaro-clang-armv7-2stage", max_builds=1), create_worker("linaro-clang-armv7-global-isel", max_builds=1), create_worker("linaro-clang-armv7-vfpv3-2stage", max_builds=1), @@ -28,7 +27,6 @@ def get_all(): create_worker("linaro-clang-aarch64-quick", max_builds=1), create_worker("linaro-clang-aarch64-lld-2stage", max_builds=1), create_worker("linaro-clang-aarch64-global-isel", max_builds=1), - create_worker("linaro-clang-aarch64-full-2stage", max_builds=1), create_worker("linaro-lldb-aarch64-ubuntu", max_builds=1), create_worker("linaro-flang-aarch64-dylib", max_builds=1), create_worker("linaro-flang-aarch64-sharedlibs", max_builds=1), @@ -53,6 +51,7 @@ def get_all(): create_worker("linaro-armv8-windows-msvc-03", max_builds=1), create_worker("linaro-armv8-windows-msvc-04", max_builds=1), create_worker("linaro-armv8-windows-msvc-05", max_builds=1), + create_worker("linaro-armv8-windows-msvc-06", max_builds=1), # Linux s390x Ubuntu Focal, IBM z13 (5GHz), 64GB of RAM create_worker("onnx-mlir-nowarn-linux-s390x", properties={'jobs' : 4}, max_builds=1), @@ -161,9 +160,6 @@ def get_all(): # Windows x86_64 32 CPUs, 125 GB RAM create_worker("libc-x86_64-windows", properties={'jobs': 32}, max_builds=2), - # Debian arm32 single core, 512 MB RAM backed by 32 GB swap memory - create_worker("libc-arm32-debian", properties={'jobs': 1}, max_builds=1), - # Debian x86_64 AMD Rome 16 CPUs, 64 GB RAM create_worker("libc-arm32-qemu-debian", properties={'jobs': 16}, max_builds=1), @@ -331,19 +327,19 @@ def get_all(): # Ubuntu 22.04 on AWS, x86_64 PS4 target create_worker("sie-linux-worker", properties={'jobs': 40}, max_builds=1), - # 2012 Mac Mini host, 16GB memory: - # - Ubuntu 18.04 in docker container + # Ubuntu 20.04 in docker container on AWS create_worker("doug-worker-1a", properties={'jobs': 8}, max_builds=1), - # - Ubuntu 22.04 in docker container + # Ubuntu 22.04 in docker container on 2012 Mac Mini create_worker("doug-worker-1b", properties={'jobs': 8}, max_builds=1), # Ubuntu 18.04 in docker container on Ryzen 4800U create_worker("doug-worker-2a", properties={'jobs': 16}, max_builds=1), - # Ubuntu 20.04 on AWS, AMD EPYC 7R13 shared + # Ubuntu 22.04 on AWS create_worker("sie-linux-worker2", max_builds=1), + # Ubuntu 20.04 on AWS create_worker("sie-linux-worker3", max_builds=1), # Ubuntu 22.04 on AWS, x86_64 PS5 target create_worker("sie-linux-worker4", properties={'jobs': 40}, max_builds=1), - # Ubuntu 22.04 on AWS + # Ubuntu 24.04 on AWS create_worker("sie-linux-worker5", max_builds=1), # Windows Server 2019 on AWS, x86_64 PS4 target @@ -351,9 +347,10 @@ def get_all(): # Mac target, Intel Core i7-8700B, 64GB create_worker("doug-worker-3", properties={'jobs': 12}, max_builds=1), + # Mac target, Apple M2 Pro, 32GB + create_worker("doug-worker-4", max_builds=1), # Mac target, Apple M1, 16GB - create_worker("doug-worker-4", properties={'jobs': 8}, max_builds=1), - create_worker("doug-worker-5", properties={'jobs': 8}, max_builds=1), + create_worker("doug-worker-5", max_builds=1), # Ubuntu 20.04, AMD Ryzen 5 PRO 3400GE, 32GB create_worker("doug-worker-6", properties={'jobs': 8}, 
max_builds=1), @@ -408,14 +405,147 @@ def get_all(): create_worker("rise-worker-3", properties={'jobs' : 32}, max_builds=1), create_worker("rise-worker-4", properties={'jobs' : 32}, max_builds=1), + # Builders that run the premerge configuration + # These workers are specifically for running the premerge configuration + # postcommit (after changes have landed in main). The workers for the + # infrastructure that runs the checks premerge are setup through Github + # Actions under the premerge/ folder in llvm-zorg. + create_worker( + "premerge-us-central-linux-b1", + properties={"jobs": 64}, + max_builds=1, + notify_on_missing=[ + "llvm-presubmit-infra@google.com", + "aidengrossman@google.com", + "cmtice@google.com", + ], + ), + create_worker( + "premerge-us-central-linux-b2", + properties={"jobs": 64}, + max_builds=1, + notify_on_missing=[ + "llvm-presubmit-infra@google.com", + "aidengrossman@google.com", + "cmtice@google.com", + ], + ), + create_worker( + "premerge-us-central-linux-b3", + properties={"jobs": 64}, + max_builds=1, + notify_on_missing=[ + "llvm-presubmit-infra@google.com", + "aidengrossman@google.com", + "cmtice@google.com", + ], + ), + create_worker( + "premerge-us-central-windows-b1", + properties={"jobs": 64}, + max_builds=1, + notify_on_missing=[ + "llvm-presubmit-infra@google.com", + "aidengrossman@google.com", + "cmtice@google.com", + ], + ), + create_worker( + "premerge-us-central-windows-b2", + properties={"jobs": 64}, + max_builds=1, + notify_on_missing=[ + "llvm-presubmit-infra@google.com", + "aidengrossman@google.com", + "cmtice@google.com", + ], + ), + create_worker( + "premerge-us-central-windows-b3", + properties={"jobs": 64}, + max_builds=1, + notify_on_missing=[ + "llvm-presubmit-infra@google.com", + "aidengrossman@google.com", + "cmtice@google.com", + ], + ), + create_worker( + "premerge-us-west-linux-b1", + properties={"jobs": 64}, + max_builds=1, + notify_on_missing=[ + "llvm-presubmit-infra@google.com", + "aidengrossman@google.com", + "cmtice@google.com", + ], + ), + create_worker( + "premerge-us-west-linux-b2", + properties={"jobs": 64}, + max_builds=1, + notify_on_missing=[ + "llvm-presubmit-infra@google.com", + "aidengrossman@google.com", + "cmtice@google.com", + ], + ), + create_worker( + "premerge-us-west-linux-b3", + properties={"jobs": 64}, + max_builds=1, + notify_on_missing=[ + "llvm-presubmit-infra@google.com", + "aidengrossman@google.com", + "cmtice@google.com", + ], + ), + create_worker( + "premerge-us-west-windows-b1", + properties={"jobs": 64}, + max_builds=1, + notify_on_missing=[ + "llvm-presubmit-infra@google.com", + "aidengrossman@google.com", + "cmtice@google.com", + ], + ), + create_worker( + "premerge-us-west-windows-b2", + properties={"jobs": 64}, + max_builds=1, + notify_on_missing=[ + "llvm-presubmit-infra@google.com", + "aidengrossman@google.com", + "cmtice@google.com", + ], + ), + create_worker( + "premerge-us-west-windows-b3", + properties={"jobs": 64}, + max_builds=1, + notify_on_missing=[ + "llvm-presubmit-infra@google.com", + "aidengrossman@google.com", + "cmtice@google.com", + ], + ), + # Workers for the profcheck configuration + # These workers run builds with LLVM_ENABLE_PROFCHECK=ON to ensure + # that profile information is propagated correctly. 
+ create_worker( + "profcheck-b1", + properties={"jobs": 64}, + max_builds=1, + notify_on_missing=["profcheck-buildbot@google.com"], + ), + create_worker( + "profcheck-b2", + properties={"jobs": 64}, + max_builds=1, + notify_on_missing=["profcheck-buildbot@google.com"], + ), # FIXME: A placeholder for annoying worker which nobody could stop. # adding it avoid logs spammed by failed authentication for that worker. create_worker("mlir-ubuntu-worker0"), - - # Linux builder matching Buildkite pre-merge checks configuration. - create_worker("premerge-linux-1", max_builds=1, missing_timeout=300, - notify_on_missing="llvm-premerge-buildbots@google.com"), - # Windows builder matching Buildkite pre-merge checks configuration. - create_worker("premerge-windows-1", max_builds=1, missing_timeout=300, - notify_on_missing="llvm-premerge-buildbots@google.com"), ] diff --git a/llvm-ops-metrics/ops-container/process_llvm_commits.py b/llvm-ops-metrics/ops-container/process_llvm_commits.py deleted file mode 100644 index fdf20cc91..000000000 --- a/llvm-ops-metrics/ops-container/process_llvm_commits.py +++ /dev/null @@ -1,228 +0,0 @@ -import dataclasses -import datetime -import logging -import os -import git -from google.cloud import bigquery -import requests - -GRAFANA_URL = ( - "https://influx-prod-13-prod-us-east-0.grafana.net/api/v1/push/influx/write" -) -REPOSITORY_URL = "https://github.com/llvm/llvm-project.git" - -# Number of days to look back for new commits -# We allow some buffer time between when a commit is made and when it is queried -# for reviews. This is allow time for any events to propogate in the GitHub -# Archive BigQuery tables. -LOOKBACK_DAYS = 2 - -# Template query to find pull requests associated with commits on a given day. -# Searches for pull requests within a lower and upper bound of Github Archive -# event dates. -GITHUB_ARCHIVE_REVIEW_QUERY = """ -WITH PullRequestReviews AS ( - SELECT DISTINCT - JSON_VALUE(payload, '$.pull_request.id') AS pr_id, - JSON_VALUE(payload, '$.review.state') as review_state, - FROM `githubarchive.day.20*` - WHERE - repo.id = 75821432 - AND `type` = 'PullRequestReviewEvent' - AND (_TABLE_SUFFIX BETWEEN '{lower_review_bound}' AND '{upper_review_bound}') -) -SELECT DISTINCT - JSON_VALUE(pr_event.payload, '$.pull_request.merge_commit_sha') AS merge_commit_sha, - JSON_VALUE(pr_event.payload, '$.pull_request.number') AS pull_request_number, - pr_review.review_state as review_state -FROM `githubarchive.day.{commit_date}` AS pr_event -LEFT JOIN PullRequestReviews as pr_review ON - JSON_VALUE(pr_event.payload, '$.pull_request.id') = pr_review.pr_id # PR ID should match the review events -WHERE - pr_event.repo.id = 75821432 - AND pr_event.`type` = 'PullRequestEvent' - AND JSON_VALUE(pr_event.payload, '$.pull_request.merge_commit_sha') IS NOT NULL -""" - - -@dataclasses.dataclass -class LLVMCommitInfo: - commit_sha: str - commit_datetime: datetime.datetime - commit_timestamp_seconds: int - has_pull_request: bool = False - pr_number: int = 0 - is_reviewed: bool = False - is_approved: bool = False - - -def scrape_new_commits_by_date( - target_datetime: datetime.datetime, -) -> list[git.Commit]: - """Scrape new commits from a given dates. - - Args: - target_datetime: The date to scrape for new commits. - - Returns: - List of new commits made on the given date. 
- """ - # Clone repository to current working directory - repo = git.Repo.clone_from( - url=REPOSITORY_URL, - to_path="./llvm-project", - ) - - # Scrape for new commits - # iter_commits() yields commits in reverse chronological order - new_commits = [] - for commit in repo.iter_commits(): - # Skip commits that don't match the target date - committed_datetime = commit.committed_datetime.astimezone( - datetime.timezone.utc - ) - if committed_datetime.date() != target_datetime.date(): - continue - - new_commits.append(commit) - - logging.info("Found %d new commits", len(new_commits)) - return new_commits - - -def query_for_reviews( - new_commits: list[git.Commit], commit_datetime: datetime.datetime -) -> list[LLVMCommitInfo]: - """Query GitHub Archive BigQuery for reviews of new commits. - - Args: - new_commits: List of new commits to query for reviews. - commit_datetime: The date that the new commits were made on. - - Returns: - List of LLVMCommitInfo objects for each commit's review information. - """ - - # Search for reviews in the last 4 weeks - earliest_review_date = ( - commit_datetime - datetime.timedelta(weeks=4) - ).strftime("%Y%m%d") - latest_review_date = datetime.datetime.now(datetime.timezone.utc).strftime( - "%Y%m%d" - ) - - # Create a map of commit sha to info - new_commits = { - commit.hexsha: LLVMCommitInfo( - commit.hexsha, commit.committed_datetime, commit.committed_date - ) - for commit in new_commits - } - - # Query each relevant daily GitHub Archive table - query = GITHUB_ARCHIVE_REVIEW_QUERY.format( - commit_date=commit_datetime.strftime("%Y%m%d"), - lower_review_bound=earliest_review_date.removeprefix("20"), - upper_review_bound=latest_review_date.removeprefix("20"), - ) - bq_client = bigquery.Client() - query_job = bq_client.query(query) - results = query_job.result() - - # Process each found merge commit - for row in results: - # If this commit is irrelevant, skip it - # Not every merge_commit_sha makes it into main, a "merge commit" can mean - # different things depending on the state of the pull request. - # docs.github.com/en/rest/pulls/pulls#get-a-pull-request for more details. - merge_commit_sha = row["merge_commit_sha"] - if merge_commit_sha not in new_commits: - continue - - commit_info = new_commits[merge_commit_sha] - commit_info.has_pull_request = True - commit_info.pr_number = row["pull_request_number"] - commit_info.is_reviewed = row["review_state"] is not None - commit_info.is_approved = row["review_state"] == "approved" - - logging.info( - "Total gigabytes processed: %d GB", - query_job.total_bytes_processed / (1024**3), - ) - - return list(new_commits.values()) - - -def upload_daily_metrics( - grafana_api_key: str, - grafana_metrics_userid: str, - new_commits: list[LLVMCommitInfo], -) -> None: - """Upload daily commit metrics to Grafana. - - Args: - grafana_api_key: The key to make API requests with. - grafana_metrics_userid: The user to make API requests with. - new_commits: List of commits to process & upload to Grafana. 
- - Returns: - None - """ - # Count each type of commit made - approval_count = 0 - review_count = 0 - pull_request_count = 0 - push_count = 0 - for commit in new_commits: - if commit.is_approved: - approval_count += 1 - elif commit.is_reviewed: - review_count += 1 - elif commit.has_pull_request: - pull_request_count += 1 - else: - push_count += 1 - - # Post data via InfluxDB API call - request_data = ( - "llvm_project_main_daily_commits" - " approval_count={},review_count={},pull_request_count={},push_count={}" - ).format(approval_count, review_count, pull_request_count, push_count) - response = requests.post( - GRAFANA_URL, # Set timestamp precision to seconds - headers={"Content-Type": "text/plain"}, - data=request_data, - auth=(grafana_metrics_userid, grafana_api_key), - ) - - if response.status_code < 200 or response.status_code >= 300: - logging.error("Failed to submit data to Grafana: %s", response.text) - - -def main() -> None: - grafana_api_key = os.environ["GRAFANA_API_KEY"] - grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"] - - # Scrape new commits - date_to_scrape = datetime.datetime.now( - datetime.timezone.utc - ) - datetime.timedelta(days=LOOKBACK_DAYS) - logging.info( - "Cloning and scraping llvm/llvm-project for new commits on %s", - date_to_scrape.strftime("%Y-%m-%d"), - ) - new_commits = scrape_new_commits_by_date(date_to_scrape) - if not new_commits: - logging.info("No new commits found. Exiting.") - return - - logging.info("Querying for reviews of new commits.") - new_commit_info = query_for_reviews(new_commits, date_to_scrape) - - logging.info("Uploading metrics to Grafana.") - upload_daily_metrics(grafana_api_key, grafana_metrics_userid, new_commit_info) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - main() diff --git a/premerge/bigquery_schema/llvm_commits_table_schema.json b/premerge/bigquery_schema/llvm_commits_table_schema.json new file mode 100644 index 000000000..7ddd27e14 --- /dev/null +++ b/premerge/bigquery_schema/llvm_commits_table_schema.json @@ -0,0 +1,96 @@ +[ + { + "name": "commit_sha", + "type": "STRING", + "mode": "NULLABLE", + "description": "Commit hexsha of a commit made to llvm/llvm-project:main" + }, + { + "name": "commit_author", + "type": "STRING", + "mode": "NULLABLE", + "description": "GitHub username of the commit author" + }, + { + "name": "commit_timestamp_seconds", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "Time this commit was made at, as a Unix timestamp" + }, + { + "name": "has_pull_request", + "type": "BOOLEAN", + "mode": "NULLABLE", + "description": "Whether or not this commit has an associated pull request" + }, + { + "name": "pull_request_number", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "Number of the pull request associated with this commit" + }, + { + "name": "is_reviewed", + "type": "BOOLEAN", + "mode": "NULLABLE", + "description": "Whether or not the pull request for this commit was reviewed" + }, + { + "name": "is_approved", + "type": "BOOLEAN", + "mode": "NULLABLE", + "description": "Whether or not the pull request for this commit was approved" + }, + { + "name": "reviewers", + "type": "STRING", + "mode": "REPEATED", + "description": "List of GitHub users who reviewed the pull request for this commit" + }, + { + "name": "is_revert", + "type": "BOOLEAN", + "mode": "NULLABLE", + "description": "Whether or not this commit is a revert" + }, + { + "name": "pull_request_reverted", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "Pull request 
matched in revert message. Not reliable for determining if a PR was reverted, `commit_reverted` may contain a commit belonging to a PR" + }, + { + "name": "commit_reverted", + "type": "STRING", + "mode": "NULLABLE", + "description": "Commit sha matched in revert message. Not reliable for determining if a commit was reverted, `pull_request_reverted` may contain a PR contributing a commit" + }, + { + "name": "diff", + "type": "RECORD", + "mode": "REPEATED", + "description": "List of files and line addition/deletion counts for this commit", + "fields": [ + { + "name": "file", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "additions", + "type": "INTEGER", + "mode": "NULLABLE" + }, + { + "name": "deletions", + "type": "INTEGER", + "mode": "NULLABLE" + }, + { + "name": "total", + "type": "INTEGER", + "mode": "NULLABLE" + } + ] + } +] diff --git a/premerge/buildbot/Dockerfile b/premerge/buildbot/Dockerfile new file mode 100644 index 000000000..c295dd6d0 --- /dev/null +++ b/premerge/buildbot/Dockerfile @@ -0,0 +1,13 @@ +FROM ubuntu:24.04 +RUN apt-get update && apt-get install -y \ + python3 \ + python3-pip \ + git \ + python-is-python3 +COPY requirements.lock.txt /requirements.lock.txt +RUN pip3 install --break-system-packages -r /requirements.lock.txt && rm /requirements.lock.txt +RUN mkdir /app +WORKDIR /app +COPY startup.sh . +RUN chmod +x startup.sh +ENTRYPOINT /app/startup.sh diff --git a/premerge/buildbot/requirements.lock.txt b/premerge/buildbot/requirements.lock.txt new file mode 100644 index 000000000..a49faeed5 --- /dev/null +++ b/premerge/buildbot/requirements.lock.txt @@ -0,0 +1,92 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --output-file=requirements.lock.txt requirements.txt +# +attrs==25.3.0 + # via twisted +autobahn==24.4.2 + # via buildbot-worker +automat==25.4.16 + # via twisted +buildbot-worker==3.11.7 + # via -r requirements.txt +cachetools==5.5.2 + # via google-auth +certifi==2025.7.14 + # via + # kubernetes + # requests +cffi==1.17.1 + # via cryptography +charset-normalizer==3.4.2 + # via requests +constantly==23.10.4 + # via twisted +cryptography==45.0.5 + # via autobahn +durationpy==0.10 + # via kubernetes +google-auth==2.40.3 + # via kubernetes +hyperlink==21.0.0 + # via + # autobahn + # twisted +idna==3.10 + # via + # hyperlink + # requests +incremental==24.7.2 + # via twisted +kubernetes==33.1.0 + # via -r requirements.txt +msgpack==1.1.1 + # via buildbot-worker +oauthlib==3.3.1 + # via + # kubernetes + # requests-oauthlib +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth +pycparser==2.22 + # via cffi +python-dateutil==2.9.0.post0 + # via kubernetes +pyyaml==6.0.2 + # via kubernetes +requests==2.32.4 + # via + # kubernetes + # requests-oauthlib +requests-oauthlib==2.0.0 + # via kubernetes +rsa==4.9.1 + # via google-auth +six==1.17.0 + # via + # buildbot-worker + # kubernetes + # python-dateutil +twisted==25.5.0 + # via buildbot-worker +txaio==25.6.1 + # via autobahn +typing-extensions==4.14.1 + # via twisted +urllib3==2.5.0 + # via + # kubernetes + # requests +websocket-client==1.8.0 + # via kubernetes +zope-interface==7.2 + # via twisted + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/premerge/buildbot/requirements.txt b/premerge/buildbot/requirements.txt new file mode 100644 index 000000000..dfd1b8447 --- /dev/null +++ b/premerge/buildbot/requirements.txt @@ -0,0 +1,2 @@ 
+kubernetes==33.1.0 +buildbot-worker==3.11.7 diff --git a/premerge/buildbot/startup.sh b/premerge/buildbot/startup.sh new file mode 100644 index 000000000..e7429b633 --- /dev/null +++ b/premerge/buildbot/startup.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# This script performs all the necessary setup and then starts the buildbot +# worker. + +mkdir /worker +buildbot-worker create-worker /worker \ + lab.llvm.org:9994 \ + $BUILDBOT_USERNAME \ + $BUILDBOT_PASSWORD + +echo "Google LLVM Premerge Infra Rotation " \ + > /worker/info/admin + +{ + echo "Premerge container (https://github.com/llvm/llvm-project/pkgs/container/ci-ubuntu-24.04)" + echo "GCP n2/n2d standard instances." +} > /worker/info/host + +buildbot-worker start /worker + +sleep 31536000000 diff --git a/premerge/buildbot_deployment.yaml b/premerge/buildbot_deployment.yaml new file mode 100644 index 000000000..403f22e01 --- /dev/null +++ b/premerge/buildbot_deployment.yaml @@ -0,0 +1,37 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ${ buildbot_name } + namespace: ${ buildbot_namespace } + labels: + app: ${ buildbot_name } +spec: + selector: + matchLabels: + app: ${ buildbot_name } + template: + metadata: + labels: + app: ${ buildbot_name } + spec: + serviceAccountName: buildbot-ksa + containers: + - name: buildbot + image: ghcr.io/llvm/premerge-buildbot:latest + env: + - name: BUILDBOT_USERNAME + value: ${ buildbot_name } + - name: BUILDBOT_PASSWORD + valueFrom: + secretKeyRef: + name: ${ secret_name } + key: password + - name: BUILDBOT_REGION + value: ${ buildbot_region } + resources: + requests: + memory: "512Mi" + cpu: "500m" + limits: + memory: "1Gi" + cpu: "750m" diff --git a/premerge/cluster-management.md b/premerge/cluster-management.md index 3caec1e2a..e9ba4cc02 100644 --- a/premerge/cluster-management.md +++ b/premerge/cluster-management.md @@ -219,9 +219,9 @@ terraform destroy -target module.premerge_cluster_us_central_resources.kubernete ### Bumping the Version Number This is necessary only for bumping the version of ARC. This involves simply -updating the version field for the `premerge_resources` objects in `main.tf`. -Each premerge cluster (`llvm-premerge-cluster-us-central` and -`llvm-premerge-cluster-us-west`) has a separate version. This allows for +updating the `github_arc_version` field for premerge cluster resources in +`premerge/main.tf`. Each premerge cluster (`llvm-premerge-cluster-us-central` +and `llvm-premerge-cluster-us-west`) has a separate version. This allows for updating them separately which allows for zero-downtime upgrades when the system is operating at low capacity. Make sure to commit the changes and push them to `llvm-zorg` to ensure others working on the terraform configuration @@ -237,3 +237,103 @@ ensure they are in a state consistent with the terraform IaC definitions. [Strategies for Upgrading ARC](https://www.kenmuse.com/blog/strategies-for-upgrading-arc/) outlines how ARC should be upgraded and why. + +## Grafana tokens + +The cluster has multiple services communicating with Grafana Cloud: + - the metrics container + - per-node monitoring (Grafana Alloy, Prometheus node exporter) + - per-cluster monitoring (Opencost, Alloy) + +The full description of the services can be found on the [k8s-monitoring Helm +chart repository](https://github.com/grafana/k8s-monitoring-helm). + +Authentication to Grafana Cloud is handled through `Cloud access policies`. 
+Currently, the cluster uses two kinds of tokens:
+
+ - `llvm-premerge-metrics-grafana-api-key`
+   Used by: metrics container
+   Scopes: `metrics:write`
+
+ - `llvm-premerge-grafana-token`
+   Used by: Alloy, Prometheus node exporter & other services.
+   Scopes: `metrics:read`, `metrics:write`, `logs:write`
+
+We've set up two cloud access policies with matching names, so the scopes are
+already configured. If you need to rotate a token, you need to:
+
+ 1. Log in to Grafana Cloud.
+ 2. Navigate to `Home > Administration > Users and Access > Cloud Access Policies`.
+ 3. Create a new token in the desired cloud access policy.
+ 4. Go to `GCP > Security > Secret Manager`.
+ 5. Click on the secret to update.
+ 6. Click on `New version`.
+ 7. Paste the token displayed in Grafana and tick `Disable all past versions`.
+
+At this stage, you should have a **single** enabled secret version on GCP. If
+you display the value, you should see the Grafana token.
+
+Then, go to the `llvm-zorg` repository. Make sure you have pulled the latest
+changes on `main`, and then, as usual, run `terraform apply`.
+
+At this stage, newly created services will use the new token, but existing
+deployments still rely on the old one. You need to manually restart the
+deployments on both the `us-west1` and `us-central1-a` clusters.
+
+Run:
+
+```bash
+gcloud container clusters get-credentials llvm-premerge-cluster-us-west --location us-west1
+kubectl scale --replicas=0 --namespace grafana deployments \
+  grafana-k8s-monitoring-opencost \
+  grafana-k8s-monitoring-kube-state-metrics \
+  grafana-k8s-monitoring-alloy-events
+
+gcloud container clusters get-credentials llvm-premerge-cluster-us-central --location us-central1-a
+kubectl scale --replicas=0 --namespace grafana deployments \
+  grafana-k8s-monitoring-opencost \
+  grafana-k8s-monitoring-kube-state-metrics \
+  grafana-k8s-monitoring-alloy-events
+kubectl scale --replicas=0 --namespace metrics deployment metrics
+```
+
+:warning: the `metrics` namespace only exists in the `us-central1-a` cluster.
+
+Wait until the command `kubectl get deployments --namespace grafana` shows
+all deployments have been scaled down to zero. Then scale everything back up:
+
+```bash
+gcloud container clusters get-credentials llvm-premerge-cluster-us-west --location us-west1
+kubectl scale --replicas=1 --namespace grafana deployments \
+  grafana-k8s-monitoring-opencost \
+  grafana-k8s-monitoring-kube-state-metrics \
+  grafana-k8s-monitoring-alloy-events
+
+gcloud container clusters get-credentials llvm-premerge-cluster-us-central --location us-central1-a
+kubectl scale --replicas=1 --namespace grafana deployments \
+  grafana-k8s-monitoring-opencost \
+  grafana-k8s-monitoring-kube-state-metrics \
+  grafana-k8s-monitoring-alloy-events
+kubectl scale --replicas=1 --namespace metrics deployment metrics
+```
+
+You can check the restarted service logs for errors. If the token is invalid
+or the scopes are wrong, you should see some `401` error codes.
+
+```bash
+kubectl logs -n metrics deployment/metrics
+kubectl logs -n grafana deployment/grafana-k8s-monitoring-opencost
+```
+
+At this stage, all long-lived services should be using the new tokens.
+**DO NOT DELETE THE OLD TOKENS YET.**
+The existing CI jobs can be quite long-lived, so we need to wait for them to
+finish. New CI jobs will pick up the new tokens.
+
+After 24 hours, log back into
+`Home > Administration > Users and Access > Cloud Access Policies` and expand
+the token lists. The new tokens' `Last used at` timestamps should be at most a
+dozen minutes old, while the old tokens should have remained unused for
+several hours.
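+You can also double-check the GCP side from the command line: after a clean
+rotation, only the newest secret version should still be enabled. This is a
+minimal sketch; it assumes the secret shares the token's name (here
+`llvm-premerge-grafana-token`) and that `gcloud` already points at the
+premerge project, so adjust both if they differ:
+
+```bash
+# List every version of the secret with its state. Only the most recent
+# version should be ENABLED; older ones should be DISABLED (or DESTROYED).
+gcloud secrets versions list llvm-premerge-grafana-token \
+  --format='table(name, state, createTime)'
+```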
+If this is the case, congratulations, you've successfully rotated security +tokens! You can now safely delete the old unused tokens. diff --git a/premerge/gke_cluster/main.tf b/premerge/gke_cluster/main.tf index 832d23c4f..cc01357ea 100644 --- a/premerge/gke_cluster/main.tf +++ b/premerge/gke_cluster/main.tf @@ -19,6 +19,16 @@ resource "google_container_cluster" "llvm_premerge" { workload_identity_config { workload_pool = "llvm-premerge-checks.svc.id.goog" } + + # We prefer that maintenance is done on weekends between 02:00 and 08:00 + # UTC when commit traffic is low to avoid interruptions. + maintenance_policy { + recurring_window { + start_time = "2025-07-24T02:00:00Z" + end_time = "2025-07-24T08:00:00Z" + recurrence = "FREQ=WEEKLY;BYDAY=SA,SU" + } + } } resource "google_container_node_pool" "llvm_premerge_linux_service" { @@ -29,18 +39,11 @@ resource "google_container_node_pool" "llvm_premerge_linux_service" { node_locations = var.service_node_pool_locations node_config { - machine_type = "e2-highcpu-4" + machine_type = "e2-standard-4" workload_metadata_config { mode = "GKE_METADATA" } - # Terraform wants to recreate the node pool everytime whe running - # terraform apply unless we explicitly set this. - # TODO(boomanaiden154): Look into why terraform is doing this so we do - # not need this hack. - resource_labels = { - "goog-gke-node-pool-provisioning-model" = "on-demand" - } } } @@ -66,13 +69,40 @@ resource "google_container_node_pool" "llvm_premerge_linux" { "premerge-platform" : "linux" } disk_size_gb = 200 - # Terraform wants to recreate the node pool everytime whe running - # terraform apply unless we explicitly set this. - # TODO(boomanaiden154): Look into why terraform is doing this so we do - # not need this hack. - resource_labels = { - "goog-gke-node-pool-provisioning-model" = "on-demand" + + # Enable workload identity federation for this pool so that we can access + # GCS buckets. + workload_metadata_config { + mode = "GKE_METADATA" + } + } +} + +# Buildbot here refers specifically to the LLVM Buildbot postcommit +# testing infrastructure. These machines are used specifically for testing +# commits after they have landed in main. +resource "google_container_node_pool" "llvm_buildbot_linux" { + name = "llvm-buildbot-linux" + location = var.region + cluster = google_container_cluster.llvm_premerge.name + initial_node_count = 0 + + autoscaling { + total_min_node_count = 0 + total_max_node_count = 3 + } + + node_config { + machine_type = var.linux_machine_type + taint { + key = "buildbot-platform" + value = "linux" + effect = "NO_SCHEDULE" } + labels = { + "buildbot-platform" : "linux" + } + disk_size_gb = 200 # Enable workload identity federation for this pool so that we can access # GCS buckets. @@ -104,13 +134,6 @@ resource "google_container_node_pool" "llvm_premerge_libcxx" { "premerge-platform-libcxx" : "linux-libcxx" } disk_size_gb = 200 - # Terraform wants to recreate the node pool everytime whe running - # terraform apply unless we explicitly set this. - # TODO(boomanaiden154): Look into why terraform is doing this so we do - # not need this hack. - resource_labels = { - "goog-gke-node-pool-provisioning-model" = "on-demand" - } } } @@ -149,13 +172,56 @@ resource "google_container_node_pool" "llvm_premerge_windows_2022" { } disk_size_gb = 200 disk_type = "pd-ssd" - # Terraform wants to recreate the node pool everytime whe running - # terraform apply unless we explicitly set this. 
-    # TODO(boomanaiden154): Look into why terraform is doing this so we do
-    # not need this hack.
-    resource_labels = {
-      "goog-gke-node-pool-provisioning-model" = "on-demand"
+
+    # Enable workload identity federation for this pool so that we can access
+    # GCS buckets.
+    workload_metadata_config {
+      mode = "GKE_METADATA"
     }
+  }
+}
+
+# Buildbot here refers specifically to the LLVM Buildbot postcommit
+# testing infrastructure. These machines are used specifically for testing
+# commits after they have landed in main.
+resource "google_container_node_pool" "llvm_buildbot_windows_2022" {
+  name               = "llvm-buildbot-windows-2022"
+  location           = var.region
+  cluster            = google_container_cluster.llvm_premerge.name
+  initial_node_count = 0
+
+  autoscaling {
+    total_min_node_count = 0
+    total_max_node_count = 3
+  }
+
+  # We do not set a taint for the windows nodes as kubernetes by default sets
+  # a node.kubernetes.io/os taint for windows nodes.
+  node_config {
+    # Use the Linux machine type here as we want to keep the windows machines
+    # symmetric with the Linux machines for faster builds. Throughput is not
+    # as much of a concern postcommit.
+    machine_type = var.linux_machine_type
+    labels = {
+      "buildbot-platform" : "windows-2022"
+    }
+    image_type = "WINDOWS_LTSC_CONTAINERD"
+    windows_node_config {
+      osversion = "OS_VERSION_LTSC2022"
+    }
+    # Add a script that runs on the initial boot to disable Windows Defender.
+    # Windows Defender causes an increase in test times by approximately an
+    # order of magnitude.
+    metadata = {
+      "sysprep-specialize-script-ps1" = "Set-MpPreference -DisableRealtimeMonitoring $true"
+      # Terraform wants to recreate the node pool every time when running
+      # terraform apply unless we explicitly set this.
+      # TODO(boomanaiden154): Look into why terraform is doing this so we do
+      # not need this hack.
+      "disable-legacy-endpoints" = "true"
+    }
+    disk_size_gb = 200
+    disk_type    = "pd-ssd"

    # Enable workload identity federation for this pool so that we can access
    # GCS buckets.
@@ -171,6 +237,19 @@ resource "google_storage_bucket" "object_cache_linux" { uniform_bucket_level_access = true public_access_prevention = "enforced" + + soft_delete_policy { + retention_duration_seconds = 0 + } + + lifecycle_rule { + action { + type = "Delete" + } + condition { + age = 7 + } + } } resource "google_storage_bucket" "object_cache_windows" { @@ -179,6 +258,19 @@ resource "google_storage_bucket" "object_cache_windows" { uniform_bucket_level_access = true public_access_prevention = "enforced" + + soft_delete_policy { + retention_duration_seconds = 0 + } + + lifecycle_rule { + action { + type = "Delete" + } + condition { + age = 7 + } + } } resource "google_service_account" "object_cache_linux_gsa" { @@ -191,11 +283,22 @@ resource "google_service_account" "object_cache_windows_gsa" { display_name = format("%s Windows Object Cache Service Account", var.region) } +resource "google_service_account" "object_cache_linux_buildbot_gsa" { + account_id = format("%s-linux-buildbot", var.gcs_bucket_location) + display_name = format("%s Linux Object Cache Buildbot Service Account", var.region) +} + +resource "google_service_account" "object_cache_windows_buildbot_gsa" { + account_id = format("%s-windows-buildbot", var.gcs_bucket_location) + display_name = format("%s Windows Object Cache Buildbot Service Account", var.region) +} + resource "google_storage_bucket_iam_binding" "linux_bucket_binding" { bucket = google_storage_bucket.object_cache_linux.name role = "roles/storage.objectUser" members = [ format("serviceAccount:%s", google_service_account.object_cache_linux_gsa.email), + format("serviceAccount:%s", google_service_account.object_cache_linux_buildbot_gsa.email), ] depends_on = [ @@ -209,6 +312,7 @@ resource "google_storage_bucket_iam_binding" "windows_bucket_binding" { role = "roles/storage.objectUser" members = [ format("serviceAccount:%s", google_service_account.object_cache_windows_gsa.email), + format("serviceAccount:%s", google_service_account.object_cache_windows_buildbot_gsa.email), ] depends_on = [ @@ -242,3 +346,29 @@ resource "google_service_account_iam_binding" "windows_bucket_gsa_workload_bindi google_service_account.object_cache_windows_gsa, ] } + +resource "google_service_account_iam_binding" "linux_bucket_buildbot_gsa_workload_binding" { + service_account_id = google_service_account.object_cache_linux_buildbot_gsa.name + role = "roles/iam.workloadIdentityUser" + + members = [ + "serviceAccount:${google_service_account.object_cache_linux_buildbot_gsa.project}.svc.id.goog[llvm-premerge-linux-buildbot/buildbot-gcs-ksa]", + ] + + depends_on = [ + google_service_account.object_cache_linux_buildbot_gsa, + ] +} + +resource "google_service_account_iam_binding" "windows_bucket_buildbot_gsa_workload_binding" { + service_account_id = google_service_account.object_cache_windows_buildbot_gsa.name + role = "roles/iam.workloadIdentityUser" + + members = [ + "serviceAccount:${google_service_account.object_cache_windows_buildbot_gsa.project}.svc.id.goog[llvm-premerge-windows-2022-buildbot/buildbot-gcs-ksa]", + ] + + depends_on = [ + google_service_account.object_cache_windows_buildbot_gsa, + ] +} diff --git a/premerge/gke_cluster/outputs.tf b/premerge/gke_cluster/outputs.tf index 38b9c191d..76124e9a0 100644 --- a/premerge/gke_cluster/outputs.tf +++ b/premerge/gke_cluster/outputs.tf @@ -21,3 +21,11 @@ output "linux_object_cache_gcp_service_account_email" { output "windows_2022_object_cache_gcp_service_account_email" { value = google_service_account.object_cache_windows_gsa.email } + 
+output "linux_object_cache_buildbot_service_account_email" { + value = google_service_account.object_cache_linux_buildbot_gsa.email +} + +output "windows_2022_object_cache_buildbot_service_account_email" { + value = google_service_account.object_cache_windows_buildbot_gsa.email +} diff --git a/premerge/libcxx_runners_values.yaml b/premerge/libcxx_runners_values.yaml index 83585826e..ab41ddaab 100644 --- a/premerge/libcxx_runners_values.yaml +++ b/premerge/libcxx_runners_values.yaml @@ -21,7 +21,7 @@ template: containers: - name: runner image: ${ runner_image } - command: ["/home/runner/run.sh"] + command: ["${ command }"] resources: # If we don't set the CPU request high-enough here, 2 runners might # be scheduled on the same pod, meaning 2 jobs, and they will starve diff --git a/premerge/main.tf b/premerge/main.tf index b3bf4afc7..8fbbb8988 100644 --- a/premerge/main.tf +++ b/premerge/main.tf @@ -121,6 +121,25 @@ data "google_secret_manager_secret_version" "grafana_token" { secret = "llvm-premerge-testing-grafana-token" } +# Buildbot here refers specifically to the LLVM Buildbot postcommit +# testing infrastructure. These machines are used specifically for testing +# commits after they have landed in main. +data "google_secret_manager_secret_version" "us_central_linux_buildbot_password" { + secret = "llvm-buildbot-linux-us-central" +} + +data "google_secret_manager_secret_version" "us_central_windows_buildbot_password" { + secret = "llvm-buildbot-windows-us-central" +} + +data "google_secret_manager_secret_version" "us_west_linux_buildbot_password" { + secret = "llvm-buildbot-linux-us-west" +} + +data "google_secret_manager_secret_version" "us_west_windows_buildbot_password" { + secret = "llvm-buildbot-windows-us-west" +} + provider "kubernetes" { host = "https://${module.premerge_cluster_us_central.endpoint}" token = data.google_client_config.current.access_token @@ -138,20 +157,26 @@ provider "kubernetes" { } module "premerge_cluster_us_central_resources" { - source = "./premerge_resources" - github_app_id = data.google_secret_manager_secret_version.github_app_id.secret_data - github_app_installation_id = data.google_secret_manager_secret_version.github_app_installation_id.secret_data - github_app_private_key = data.google_secret_manager_secret_version.github_app_private_key.secret_data - cluster_name = "llvm-premerge-cluster-us-central" - grafana_token = data.google_secret_manager_secret_version.grafana_token.secret_data - runner_group_name = "llvm-premerge-cluster-us-central" - linux_runners_namespace_name = local.linux_runners_namespace_name - linux_runners_kubernetes_service_account_name = local.linux_runners_kubernetes_service_account_name - windows_2022_runners_namespace_name = local.windows_2022_runners_namespace_name - windows_2022_runners_kubernetes_service_account_name = local.windows_2022_runners_kubernetes_service_account_name - linux_object_cache_gcp_service_account_email = module.premerge_cluster_us_central.linux_object_cache_gcp_service_account_email - windows_2022_object_cache_gcp_service_account_email = module.premerge_cluster_us_central.windows_2022_object_cache_gcp_service_account_email - github_arc_version = "0.12.1" + source = "./premerge_resources" + github_app_id = data.google_secret_manager_secret_version.github_app_id.secret_data + github_app_installation_id = data.google_secret_manager_secret_version.github_app_installation_id.secret_data + github_app_private_key = data.google_secret_manager_secret_version.github_app_private_key.secret_data + cluster_name = 
"llvm-premerge-cluster-us-central" + grafana_token = data.google_secret_manager_secret_version.grafana_token.secret_data + runner_group_name = "llvm-premerge-cluster-us-central" + linux_runners_namespace_name = local.linux_runners_namespace_name + linux_runners_kubernetes_service_account_name = local.linux_runners_kubernetes_service_account_name + windows_2022_runners_namespace_name = local.windows_2022_runners_namespace_name + windows_2022_runners_kubernetes_service_account_name = local.windows_2022_runners_kubernetes_service_account_name + linux_object_cache_gcp_service_account_email = module.premerge_cluster_us_central.linux_object_cache_gcp_service_account_email + windows_2022_object_cache_gcp_service_account_email = module.premerge_cluster_us_central.windows_2022_object_cache_gcp_service_account_email + github_arc_version = "0.12.1" + linux_buildbot_name_template = "premerge-us-central-linux" + linux_buildbot_password = data.google_secret_manager_secret_version.us_central_linux_buildbot_password.secret_data + windows_buildbot_name_template = "premerge-us-central-windows" + windows_buildbot_password = data.google_secret_manager_secret_version.us_central_windows_buildbot_password.secret_data + linux_object_cache_buildbot_service_account_email = module.premerge_cluster_us_central.linux_object_cache_buildbot_service_account_email + windows_2022_object_cache_buildbot_service_account_email = module.premerge_cluster_us_central.windows_2022_object_cache_buildbot_service_account_email providers = { kubernetes = kubernetes.llvm-premerge-us-central helm = helm.llvm-premerge-us-central @@ -159,20 +184,26 @@ module "premerge_cluster_us_central_resources" { } module "premerge_cluster_us_west_resources" { - source = "./premerge_resources" - github_app_id = data.google_secret_manager_secret_version.github_app_id.secret_data - github_app_installation_id = data.google_secret_manager_secret_version.github_app_installation_id.secret_data - github_app_private_key = data.google_secret_manager_secret_version.github_app_private_key.secret_data - cluster_name = "llvm-premerge-cluster-us-west" - grafana_token = data.google_secret_manager_secret_version.grafana_token.secret_data - runner_group_name = "llvm-premerge-cluster-us-west" - linux_runners_namespace_name = local.linux_runners_namespace_name - linux_runners_kubernetes_service_account_name = local.linux_runners_kubernetes_service_account_name - windows_2022_runners_namespace_name = local.windows_2022_runners_namespace_name - windows_2022_runners_kubernetes_service_account_name = local.windows_2022_runners_kubernetes_service_account_name - linux_object_cache_gcp_service_account_email = module.premerge_cluster_us_west.linux_object_cache_gcp_service_account_email - windows_2022_object_cache_gcp_service_account_email = module.premerge_cluster_us_west.windows_2022_object_cache_gcp_service_account_email - github_arc_version = "0.12.1" + source = "./premerge_resources" + github_app_id = data.google_secret_manager_secret_version.github_app_id.secret_data + github_app_installation_id = data.google_secret_manager_secret_version.github_app_installation_id.secret_data + github_app_private_key = data.google_secret_manager_secret_version.github_app_private_key.secret_data + cluster_name = "llvm-premerge-cluster-us-west" + grafana_token = data.google_secret_manager_secret_version.grafana_token.secret_data + runner_group_name = "llvm-premerge-cluster-us-west" + linux_runners_namespace_name = local.linux_runners_namespace_name + 
linux_runners_kubernetes_service_account_name = local.linux_runners_kubernetes_service_account_name + windows_2022_runners_namespace_name = local.windows_2022_runners_namespace_name + windows_2022_runners_kubernetes_service_account_name = local.windows_2022_runners_kubernetes_service_account_name + linux_object_cache_gcp_service_account_email = module.premerge_cluster_us_west.linux_object_cache_gcp_service_account_email + windows_2022_object_cache_gcp_service_account_email = module.premerge_cluster_us_west.windows_2022_object_cache_gcp_service_account_email + github_arc_version = "0.12.1" + linux_buildbot_name_template = "premerge-us-west-linux" + linux_buildbot_password = data.google_secret_manager_secret_version.us_west_linux_buildbot_password.secret_data + windows_buildbot_name_template = "premerge-us-west-windows" + windows_buildbot_password = data.google_secret_manager_secret_version.us_west_windows_buildbot_password.secret_data + linux_object_cache_buildbot_service_account_email = module.premerge_cluster_us_west.linux_object_cache_buildbot_service_account_email + windows_2022_object_cache_buildbot_service_account_email = module.premerge_cluster_us_west.windows_2022_object_cache_buildbot_service_account_email providers = { kubernetes = kubernetes.llvm-premerge-us-west helm = helm.llvm-premerge-us-west @@ -231,13 +262,10 @@ resource "google_service_account" "operational_metrics_gsa" { display_name = "Operational Metrics GSA" } -resource "google_project_iam_binding" "bigquery_jobuser_binding" { +resource "google_project_iam_member" "operational_metrics_gsa_bq_jobuser_member" { project = google_service_account.operational_metrics_gsa.project role = "roles/bigquery.jobUser" - - members = [ - "serviceAccount:${google_service_account.operational_metrics_gsa.email}", - ] + member = "serviceAccount:${google_service_account.operational_metrics_gsa.email}" depends_on = [google_service_account.operational_metrics_gsa] } @@ -284,9 +312,7 @@ resource "kubernetes_secret" "operational_metrics_secrets" { } data = { - "github-token" = data.google_secret_manager_secret_version.metrics_github_pat.secret_data - "grafana-api-key" = data.google_secret_manager_secret_version.metrics_grafana_api_key.secret_data - "grafana-metrics-userid" = data.google_secret_manager_secret_version.metrics_grafana_metrics_userid.secret_data + "github-token" = data.google_secret_manager_secret_version.metrics_github_pat.secret_data } type = "Opaque" @@ -304,3 +330,30 @@ resource "kubernetes_manifest" "operational_metrics_cronjob" { kubernetes_service_account.operational_metrics_ksa, ] } + +# BigQuery dataset and table resources +resource "google_bigquery_dataset" "operational_metrics_dataset" { + dataset_id = "operational_metrics" + description = "Dataset for retaining operational data regarding LLVM commit trends." +} + +resource "google_bigquery_table" "llvm_commits_table" { + dataset_id = google_bigquery_dataset.operational_metrics_dataset.dataset_id + table_id = "llvm_commits" + description = "LLVM commit data, including pull request and review activity per commit." 
+
+  schema = file("./bigquery_schema/llvm_commits_table_schema.json")
+
+  depends_on = [google_bigquery_dataset.operational_metrics_dataset]
+}
+
+resource "google_bigquery_dataset_iam_binding" "operational_metrics_dataset_editor_binding" {
+  dataset_id = google_bigquery_dataset.operational_metrics_dataset.dataset_id
+  role       = "roles/bigquery.dataEditor"
+
+  members = [
+    "serviceAccount:${google_service_account.operational_metrics_gsa.email}",
+  ]
+
+  depends_on = [google_bigquery_dataset.operational_metrics_dataset, google_service_account.operational_metrics_gsa]
+}
diff --git a/premerge/operational_metrics_cronjob.yaml b/premerge/operational_metrics_cronjob.yaml
index 7c9630147..8058367cd 100644
--- a/premerge/operational_metrics_cronjob.yaml
+++ b/premerge/operational_metrics_cronjob.yaml
@@ -25,21 +25,14 @@ spec:
                   secretKeyRef:
                     name: operational-metrics-secrets
                     key: github-token
-                - name: GRAFANA_API_KEY
-                  valueFrom:
-                    secretKeyRef:
-                      name: operational-metrics-secrets
-                      key: grafana-api-key
-                - name: GRAFANA_METRICS_USERID
-                  valueFrom:
-                    secretKeyRef:
-                      name: operational-metrics-secrets
-                      key: grafana-metrics-userid
              resources:
                requests:
                  cpu: "250m"
-                 memory: "256Mi"
+                 # We explicitly use Mi here instead of a decimal number of
+                 # Gi because k8s will automatically convert to Mi, which
+                 # terraform then thinks differs from what it intended to apply.
+                 memory: "1792Mi"
                limits:
-                 cpu: "1"
-                 memory: "512Mi"
+                 cpu: "2"
+                 memory: "2Gi"
            restartPolicy: OnFailure
diff --git a/llvm-ops-metrics/ops-container/Dockerfile b/premerge/ops-container/Dockerfile
similarity index 100%
rename from llvm-ops-metrics/ops-container/Dockerfile
rename to premerge/ops-container/Dockerfile
diff --git a/premerge/ops-container/process_llvm_commits.py b/premerge/ops-container/process_llvm_commits.py
new file mode 100644
index 000000000..028d3b0b6
--- /dev/null
+++ b/premerge/ops-container/process_llvm_commits.py
@@ -0,0 +1,281 @@
+import dataclasses
+import datetime
+import logging
+import math
+import os
+import re
+import git
+from google.cloud import bigquery
+import requests
+
+GITHUB_GRAPHQL_API_URL = "https://api.github.com/graphql"
+REPOSITORY_URL = "https://github.com/llvm/llvm-project.git"
+
+# BigQuery dataset and tables to write metrics to.
+OPERATIONAL_METRICS_DATASET = "operational_metrics"
+LLVM_COMMITS_TABLE = "llvm_commits"
+
+# How many commits to query the GitHub GraphQL API for at a time.
+# Querying too many commits at once often leads to the call failing.
+GITHUB_API_BATCH_SIZE = 50
+
+# Number of days to look back for new commits.
+# We allow some buffer time between when a commit is made and when it is queried
+# for reviews. This is to allow time for any new GitHub events to propagate.
+LOOKBACK_DAYS = 2
+
+# Template GraphQL subquery to check if a commit has an associated pull request
+# and whether that pull request has been reviewed and approved.
+COMMIT_GRAPHQL_SUBQUERY_TEMPLATE = """
+commit_{commit_sha}:
+  object(oid:"{commit_sha}") {{
+    ... on Commit {{
+      author {{
+        user {{
+          login
+        }}
+      }}
+      associatedPullRequests(first: 1) {{
+        totalCount
+        pullRequest: nodes {{
+          number
+          reviewDecision
+          reviews(first: 10) {{
+            nodes {{
+              reviewer: author {{
+                login
+              }}
+            }}
+          }}
+        }}
+      }}
+    }}
+  }}
+"""
+
+
+@dataclasses.dataclass
+class LLVMCommitInfo:
+  commit_sha: str
+  commit_timestamp_seconds: int
+  diff: list[dict[str, int | str]]
+  commit_author: str = ""  # GitHub username of author is unknown until API call
+  has_pull_request: bool = False
+  pull_request_number: int = 0
+  is_reviewed: bool = False
+  is_approved: bool = False
+  reviewers: set[str] = dataclasses.field(default_factory=set)
+  is_revert: bool = False
+  pull_request_reverted: int | None = None
+  commit_reverted: str | None = None
+
+
+def scrape_new_commits_by_date(
+    target_datetime: datetime.datetime,
+) -> list[git.Commit]:
+  """Scrape new commits from a given date.
+
+  Args:
+    target_datetime: The date to scrape for new commits.
+
+  Returns:
+    List of new commits made on the given date.
+  """
+  # Clone repository to current working directory
+  repo = git.Repo.clone_from(
+      url=REPOSITORY_URL,
+      to_path="./llvm-project",
+  )
+
+  # Scrape for new commits
+  # iter_commits() yields commits in reverse chronological order
+  new_commits = []
+  for commit in repo.iter_commits():
+    # Skip commits that don't match the target date
+    committed_datetime = commit.committed_datetime.astimezone(
+        datetime.timezone.utc
+    )
+    if committed_datetime.date() != target_datetime.date():
+      continue
+
+    new_commits.append(commit)
+
+  logging.info("Found %d new commits", len(new_commits))
+  return new_commits
+
+
+def query_for_reviews(
+    new_commits: list[git.Commit], github_token: str
+) -> list[LLVMCommitInfo]:
+  """Query GitHub GraphQL API for reviews of new commits.
+
+  Args:
+    new_commits: List of new commits to query for reviews.
+    github_token: The access token to use with the GitHub GraphQL API.
+
+  Returns:
+    List of LLVMCommitInfo objects for each commit's review information.
+  """
+  # Create a map of commit sha to info
+  new_commits_info = {}
+  for commit in new_commits:
+    # Check if this commit is a revert
+    is_revert = (
+        re.match(
+            r"^Revert \".*\"( \(#\d+\))?", commit.message, flags=re.IGNORECASE
+        )
+        is not None
+    )
+
+    # Check which pull request or commit is being reverted (if any)
+    pull_request_match = re.search(
+        r"Reverts? 
(?:llvm\/llvm-project)?#(\d+)", commit.message, flags=re.IGNORECASE + ) + commit_match = re.search( + r"This reverts commit (\w+)", commit.message, flags=re.IGNORECASE + ) + pull_request_reverted = ( + int(pull_request_match.group(1)) if pull_request_match else None + ) + commit_reverted = commit_match.group(1) if commit_match else None + + # Add entry + new_commits_info[commit.hexsha] = LLVMCommitInfo( + commit_sha=commit.hexsha, + commit_timestamp_seconds=commit.committed_date, + diff=[ + { + "file": file, + "additions": line_stats["insertions"], + "deletions": line_stats["deletions"], + "total": line_stats["lines"], + } + for file, line_stats in commit.stats.files.items() + ], + is_revert=is_revert, + pull_request_reverted=pull_request_reverted, + commit_reverted=commit_reverted, + ) + + # Create GraphQL subqueries for each commit + commit_subqueries = [] + for commit_sha in new_commits_info: + commit_subqueries.append( + COMMIT_GRAPHQL_SUBQUERY_TEMPLATE.format(commit_sha=commit_sha) + ) + + api_commit_data = {} + query_template = """ + query { + repository(owner:"llvm", name:"llvm-project"){ + %s + } + } + """ + num_batches = math.ceil(len(commit_subqueries) / GITHUB_API_BATCH_SIZE) + logging.info("Querying GitHub GraphQL API in %d batches", num_batches) + for i in range(num_batches): + subquery_batch = commit_subqueries[ + i * GITHUB_API_BATCH_SIZE : (i + 1) * GITHUB_API_BATCH_SIZE + ] + query = query_template % "".join(subquery_batch) + + logging.info( + "Querying batch %d of %d (%d commits)", + i + 1, + num_batches, + len(subquery_batch), + ) + response = requests.post( + url=GITHUB_GRAPHQL_API_URL, + headers={ + "Authorization": f"bearer {github_token}", + }, + json={"query": query}, + ) + + # Exit if API call fails + # A failed API call means a large batch of data is missing and will not be + # reflected in the dashboard. The dashboard will silently misrepresent + # commit data if we continue execution, so it's better to fail loudly. + if response.status_code < 200 or response.status_code >= 300: + logging.error("Failed to query GitHub GraphQL API: %s", response.text) + exit(1) + + api_commit_data.update(response.json()["data"]["repository"]) + + # Amend commit information with GitHub data + for commit_sha, data in api_commit_data.items(): + commit_sha = commit_sha.removeprefix("commit_") + commit_info = new_commits_info[commit_sha] + commit_info.commit_author = data["author"]["user"]["login"] + + # If commit has no pull requests, skip it. No data to update. + if data["associatedPullRequests"]["totalCount"] == 0: + continue + + pull_request = data["associatedPullRequests"]["pullRequest"][0] + commit_info.has_pull_request = True + commit_info.pull_request_number = pull_request["number"] + commit_info.is_reviewed = pull_request["reviewDecision"] is not None + commit_info.is_approved = pull_request["reviewDecision"] == "APPROVED" + commit_info.reviewers = set([ + review["reviewer"]["login"] + for review in pull_request["reviews"]["nodes"] + ]) + + # There are cases where the commit author is counted as a reviewer. This is + # against what we want to measure, so remove them from the set of reviewers. + commit_info.reviewers.discard(commit_info.commit_author) + + return list(new_commits_info.values()) + + +def upload_daily_metrics_to_bigquery( + bq_client: bigquery.Client, new_commits: list[LLVMCommitInfo] +) -> None: + """Upload processed commit metrics to a BigQuery dataset. + + Args: + bq_client: The BigQuery client to use. 
+    new_commits: List of commits to process & upload to BigQuery.
+  """
+  table_ref = bq_client.dataset(OPERATIONAL_METRICS_DATASET).table(
+      LLVM_COMMITS_TABLE
+  )
+  table = bq_client.get_table(table_ref)
+  commit_records = [dataclasses.asdict(commit) for commit in new_commits]
+  errors = bq_client.insert_rows(table, commit_records)
+  if errors:
+    logging.error("Failed to upload commit info to BigQuery: %s", errors)
+    exit(1)
+
+
+def main() -> None:
+  github_token = os.environ["GITHUB_TOKEN"]
+
+  # Scrape new commits
+  date_to_scrape = datetime.datetime.now(
+      datetime.timezone.utc
+  ) - datetime.timedelta(days=LOOKBACK_DAYS)
+  logging.info(
+      "Cloning and scraping llvm/llvm-project for new commits on %s",
+      date_to_scrape.strftime("%Y-%m-%d"),
+  )
+  new_commits = scrape_new_commits_by_date(date_to_scrape)
+  if not new_commits:
+    logging.info("No new commits found. Exiting.")
+    return
+
+  logging.info("Querying for reviews of new commits.")
+  new_commit_info = query_for_reviews(new_commits, github_token)
+
+  logging.info("Uploading metrics to BigQuery.")
+  bq_client = bigquery.Client()
+  upload_daily_metrics_to_bigquery(bq_client, new_commit_info)
+  bq_client.close()
+
+
+if __name__ == "__main__":
+  logging.basicConfig(level=logging.INFO)
+  main()
diff --git a/llvm-ops-metrics/ops-container/requirements.lock.txt b/premerge/ops-container/requirements.lock.txt
similarity index 100%
rename from llvm-ops-metrics/ops-container/requirements.lock.txt
rename to premerge/ops-container/requirements.lock.txt
diff --git a/llvm-ops-metrics/ops-container/requirements.txt b/premerge/ops-container/requirements.txt
similarity index 100%
rename from llvm-ops-metrics/ops-container/requirements.txt
rename to premerge/ops-container/requirements.txt
diff --git a/premerge/pod_disruption_budget.yaml b/premerge/pod_disruption_budget.yaml
new file mode 100644
index 000000000..79bcaa623
--- /dev/null
+++ b/premerge/pod_disruption_budget.yaml
@@ -0,0 +1,10 @@
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: runner-set-pdb
+  namespace: ${ runner_set_name }
+spec:
+  minAvailable: ${ min_pod_count }
+  selector:
+    matchLabels:
+      actions.github.com/scale-set-name: ${ runner_set_name }
diff --git a/premerge/post-submit-testing.md b/premerge/post-submit-testing.md
new file mode 100644
index 000000000..04a4be1f9
--- /dev/null
+++ b/premerge/post-submit-testing.md
@@ -0,0 +1,203 @@
+# Post Submit Testing
+
+## Introduction
+
+While this infrastructure is focused on premerge testing, it is also important
+to make sure that the specific configuration we are testing is tested post
+commit as well. This document outlines why this configuration needs to be
+tested post commit, how we plan to implement this so that we get fast feedback
+at scale, and why we chose this design over others.
+
+## Background/Motivation
+
+LLVM has two types of testing upstream: premerge and postcommit. The premerge
+testing is performed using Github Actions every time a pull request (PR) is
+updated before it is merged. Premerge testing is performed using this
+infrastructure (specifically the `./premerge` folder in llvm-zorg). Landing a
+PR consists of squashing the changes into a single commit and adding that
+commit to the `main` branch in the LLVM monorepo. We care specifically about
+the state of the `main` branch because it is what the community considers to
+be the canonical tree. Currently, commits can also be added to the `main`
+branch by pushing directly to it.
+Commits pushed directly to `main` are not tested through the premerge
+pipeline, as they skip the PR merge process. After a new commit lands in the
+`main` branch, postcommit testing is performed. Most postcommit testing is
+performed through the Buildbot infrastructure. The main Buildbot instance for
+LLVM has a web instance hosted at
+[lab.llvm.org](https://lab.llvm.org/buildbot/#/builders). When a new commit
+lands in `main`, the Buildbot instance (sometimes referred to as the Buildbot
+master) will trigger many different builds, based on the configurations
+defined in the llvm-zorg repository under the `buildbot/` folder. These
+configurations are run on Buildbot workers that are hosted by the community.
+Some builders build too slowly to keep up with the pace of commits to `main`,
+so they test batches of commits. This often results in a large number of
+erroneous notifications, because the list of possible culprits for a breakage
+contains more than a single commit.
+
+For premerge testing, we do not want to notify LLVM developers about failures
+that are already present in `main` and irrelevant to their changes. This
+requires knowing the state of `main` at the time the premerge testing for a
+PR was started. We also want information on the current state of `main` to
+empower the community with the information they need to revert or forward-fix
+problematic commits. Problematic commits can land without being caught by the
+premerge system when someone pushes a commit directly to `main`, or when
+multiple PRs are individually fine but problematic in combination. This means
+we need to test the premerge configuration postcommit as well, so that we can
+determine the state of `main` (in terms of whether the build passed/failed
+and what tests failed, if any) at any given point in time. We can use this
+data to implement a "premerge advisor" that would prevent sending
+notifications about build/test failures not caused by the changes in a user's
+PR.
+
+## Design
+
+The LLVM Premerge system has two clusters, namely the central cluster in the
+Google Cloud Platform (GCP) zone `us-central1-a` and the west cluster in the
+GCP zone `us-west1`. We run two clusters in different zones for redundancy,
+so that if one fails, we can still run jobs on the other cluster. For
+postcommit testing, we plan on setting up builders attached to the Buildbot
+master described above. We will run one builder on the central cluster and
+one on the west cluster. This ensures the configuration is highly available
+(able to tolerate an entire cluster going down), similar to the premerge
+testing. The builders will be configured to use a script that will launch
+testing on each commit to `main` as if it were being run through the premerge
+testing pipeline. The post submit testing is intended to stay close to the
+premerge configuration, but will differ in some small but significant ways.
+The differences and the motivation for them are described more thoroughly in
+the [testing configuration](#testing-configuration) section. These builds
+will be run inside containers that are distributed onto the cluster inside
+kubernetes pods (the fundamental schedulable unit inside kubernetes). This
+allows kubernetes to handle details like what machine a build should run on.
+Allowing kubernetes to handle these details also enables Google Kubernetes
+Engine (GKE) to autoscale the node pools so we are not paying for unneeded
+capacity.
+Launching builds inside pods also allows each builder to handle multiple
+builds at the same time.
+
+In terms of the full flow, any commit (which can come from a direct push or
+from merging a pull request) pushed to the LLVM monorepo will get detected by
+the Buildbot master. The Buildbot master will invoke Buildbot workers running
+on our clusters. These Buildbot workers will use custom builders to launch a
+build wrapped in a kubernetes pod and report the results back to the Buildbot
+master. When the job is finished, the pod will complete and capacity will be
+available for another build, or, if there is nothing left to test, GKE will
+see that there is nothing running on one of the nodes and downscale the node
+pool.
+
+### Annotated Builder
+
+llvm-zorg has multiple types of builders. We plan on using an
+AnnotatedBuilder. AnnotatedBuilders allow the build to be driven by a custom
+Python script rather than directly dictating the shell commands that should
+be run to perform the build. We need the flexibility of the AnnotatedBuilder
+to deploy jobs on the cluster. AnnotatedBuilder-based builders also enable
+deploying changes without needing to restart the Buildbot master. Without
+this, we would have to wait for an administrator of the LLVM Buildbot master
+to restart it before our changes get deployed. This could significantly delay
+updates or responses to incidents, especially before the system is fully
+stable.
+
+### Build Distribution
+
+We want to be able to take advantage of the autoscaling functionality of the
+new cluster to efficiently utilize resources. To do this, we plan on having
+the AnnotatedBuilder script launch builds as kubernetes pods. This allows
+kubernetes to assign the builds to nodes and also allows autoscaling through
+the same mechanism that Github Actions Runner Controller (ARC) uses to
+autoscale. This enables us to quickly process builds at peak times and not
+pay for extra capacity when commit traffic is quiet, ensuring our resource
+use is efficient while still providing fast feedback.
+
+Using the kubernetes API inside of a Python script (our AnnotatedBuilder
+implementation) to launch builds does add some complexity. However, we
+believe the additional complexity is justified, as it allows us to achieve
+our goals while maintaining efficient resource usage.
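+To make this concrete, here is a rough sketch of what the pod-launching part
+of the AnnotatedBuilder script could look like, using the Python `kubernetes`
+client that the worker container already ships. The image tag, entry point
+command, and polling strategy below are illustrative assumptions, not the
+final implementation:
+
+```python
+import time
+
+from kubernetes import client, config
+
+NAMESPACE = "llvm-premerge-linux-buildbot"
+
+
+def run_build_pod(commit_sha: str) -> bool:
+  """Launch one postcommit build as a pod and wait for it to finish."""
+  config.load_incluster_config()  # The worker itself runs in the cluster.
+  core = client.CoreV1Api()
+  pod_name = f"buildbot-build-{commit_sha[:12]}"
+  pod = client.V1Pod(
+      metadata=client.V1ObjectMeta(name=pod_name),
+      spec=client.V1PodSpec(
+          restart_policy="Never",
+          # Target the dedicated postcommit node pool via its label and taint.
+          node_selector={"buildbot-platform": "linux"},
+          tolerations=[
+              client.V1Toleration(
+                  key="buildbot-platform", value="linux", effect="NoSchedule"
+              )
+          ],
+          containers=[
+              client.V1Container(
+                  name="build",
+                  image="ghcr.io/llvm/ci-ubuntu-24.04:latest",
+                  # Hypothetical entry point; the real script would run the
+                  # monolithic-linux.sh pipeline against commit_sha.
+                  command=["bash", "-c", f"run-postcommit-build {commit_sha}"],
+              )
+          ],
+      ),
+  )
+  core.create_namespaced_pod(namespace=NAMESPACE, body=pod)
+  # Poll until the pod finishes; a real implementation would also stream logs
+  # back to the Buildbot master.
+  while True:
+    phase = core.read_namespaced_pod_status(pod_name, NAMESPACE).status.phase
+    if phase in ("Succeeded", "Failed"):
+      return phase == "Succeeded"
+    time.sleep(30)
+```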
+### Testing Configuration
+
+By testing configuration, we mean both the environment that the tests run in
+and the set of tests that run. The testing configuration will be as close to
+the premerge configuration as possible. We will be running all tests inside
+the same container with the same scripts (the `monolithic-linux.sh` and
+`monolithic-windows.sh` scripts) used by the premerge testing. However, there
+will be one main difference between the premerge and postcommit testing
+configurations. In the postcommit configuration we propose testing all
+projects on every commit, rather than only testing the projects that
+themselves changed or had dependencies that changed. We propose this for two
+main reasons. Firstly, Buildbot does not have good support for heterogeneous
+build configurations. This means that testing a different set of projects
+within a single builder, depending upon the contents of the commit, could
+easily cause problems. If certain projects (that were only triggered by some
+files) were failing while others were passing, the builder would flip between
+states and significantly increase false positive notifications.
+For example, suppose that we have three commits that land in `main` and run
+through postcommit testing: commit A that touches MLIR, commit B that touches
+clang-tidy, and commit C that modifies MLIR. Commit A lands, then commit B,
+then commit C. If commit A introduces MLIR test failures into an otherwise
+clean slate, we would see the following events:
+
+1. Commit A lands. Because it touches MLIR, the buildbot worker runs the MLIR
+   tests. Some of the tests fail. The buildbot "turns red" and a notification
+   is sent out to the PR author.
+2. Commit B lands. Since it touches clang-tidy, the buildbot worker runs the
+   clang-tidy tests. All of the tests pass. The buildbot "turns green". No
+   notifications are sent out since everything is passing.
+3. Commit C lands. Since it touches MLIR, the buildbot worker runs the MLIR
+   tests. The problem introduced in commit A still exists, so some tests
+   fail. No new tests fail. Since the buildbot was previously green due to
+   the interspersed clang-tidy commit, a notification is still sent out to
+   the author of commit C.
+
+By running the tests for all projects in every postsubmit test run, we avoid
+the problematic situation described above.
+
+Another reason for running all the tests in every postsubmit run: when
+running premerge tests on a PR, we also explicitly do not test certain
+projects even though their dependencies change. While we do this because we
+suspect interactions resulting in test failures would be quite rare, they are
+possible, and having a postcommit configuration catch these rare failures
+would be useful.
+
+### Data Storage
+
+The hosted Buildbot master instance at [lab.llvm.org](https://lab.llvm.org)
+contains results for all recent postcommit runs. We plan on querying the
+results from the Buildbot master because they are already available and that
+is where they will natively be reported after the infrastructure is set up.
+Buildbot supports a
+[REST API](https://docs.buildbot.net/latest/developer/rest.html) that would
+allow for easily querying the state of a commit in `main`.
+
+In the future, we may implement a "premerge advisor" that tells the user
+which test/build failures they can safely ignore. To do this, we need to know
+what is currently failing on `main`. Each pull request is tested as if it
+were merged into `main`, which means the commit underneath the PR is very
+recent. If a premerge run fails, the premerge advisor will find the commit
+from `main` the PR is being tested on. It will then query the Buildbot master
+using the REST API for the status of that commit, or the preceding commits if
+testing for the requested commit has not yet completed, as sketched below. It
+can then report the appropriate status to the user. Having the status will
+let the premerge advisor avoid pestering LLVM developers with failures
+unrelated to their changes.
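+As a sketch of how such a query could look, using the documented builds
+endpoint of the REST API (the builder name is a placeholder, and matching on
+the `got_revision` property is an assumption about how the builders will
+report revisions):
+
+```python
+import requests
+
+BUILDBOT_API = "https://lab.llvm.org/buildbot/api/v2"
+
+
+def status_for_revision(builder: str, revision: str) -> str | None:
+  """Return 'success'/'failure' for a revision, or None if not tested yet."""
+  response = requests.get(
+      f"{BUILDBOT_API}/builders/{builder}/builds",
+      params={
+          "limit": 100,
+          "order": "-number",
+          "complete": "true",
+          # Ask the API to include the got_revision property on each build.
+          "property": "got_revision",
+      },
+      timeout=60,
+  )
+  response.raise_for_status()
+  for build in response.json()["builds"]:
+    # REST property values are [value, source] pairs.
+    got = build.get("properties", {}).get("got_revision", [None])[0]
+    if got == revision:
+      # Buildbot result codes: 0 is SUCCESS, 2 is FAILURE.
+      return "success" if build["results"] == 0 else "failure"
+  return None
+
+
+print(status_for_revision("<builder-name>", "<commit-sha>"))
+```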
+## Alternatives Considered
+
+Originally, we were looking at running postcommit testing through Github
+Actions, like the premerge tests. This was primarily because it would be easy
+to implement (a single line change in the Github Actions workflow config) and
+also easy to integrate with the Github API for the implementation of the
+premerge testing advisor. More detailed motivation for doing postcommit
+testing directly through Github is available in the
+[discourse RFC thread](https://discourse.llvm.org/t/rfc-running-premerge-postcommit-through-github-actions/86124)
+where we proposed doing this. We eventually decided against implementing it
+this way for a few reasons:
+
+1. Nonstandard - The standard postcommit testing infrastructure for LLVM is
+   Buildbot. Doing postcommit testing for the premerge configuration through
+   Github would represent a significant departure from this. This means we
+   would be leaving behind some common infrastructure and also forcing a new,
+   unfamiliar postcommit interface on LLVM contributors.
+2. Notifications - This is the biggest issue. Github currently gives very
+   little control over the notifications that are sent out when the build
+   fails or gets cancelled. This is specifically a problem with Github
+   sending out notifications for build failures even if the previous build
+   had already failed. This can easily create a lot of warning fatigue, which
+   is something we are putting a lot of effort into avoiding. We want the
+   premerge system to be perceived as reliable, to have people trust its
+   results, and, most importantly, to have people pay attention to failures
+   when they do occur. They are significantly more likely to pay attention
+   when they are the author of the patch getting the notification and the
+   feedback is actionable.
+3. Customization - Buildbot can be customized around issues like
+   notifications whereas Github cannot. Github is not particularly responsive
+   to feature requests and their notification story has been poor for a
+   while, so their lack of customization is a strategic risk.
diff --git a/premerge/premerge_resources/main.tf b/premerge/premerge_resources/main.tf
index 10b020fa6..5655bdcbb 100644
--- a/premerge/premerge_resources/main.tf
+++ b/premerge/premerge_resources/main.tf
@@ -47,6 +47,51 @@ resource "kubernetes_namespace" "llvm_premerge_windows_2022_runners" {
   }
 }

+# Buildbot here refers specifically to the LLVM Buildbot postcommit
+# testing infrastructure. These machines are used specifically for testing
+# commits after they have landed in main.
+resource "kubernetes_namespace" "llvm_premerge_linux_buildbot" {
+  metadata {
+    name = "llvm-premerge-linux-buildbot"
+  }
+}
+
+resource "kubernetes_namespace" "llvm_premerge_windows_2022_buildbot" {
+  metadata {
+    name = "llvm-premerge-windows-2022-buildbot"
+  }
+}
+
+resource "kubernetes_secret" "linux_buildbot_password" {
+  metadata {
+    name      = "linux-buildbot-password"
+    namespace = "llvm-premerge-linux-buildbot"
+  }
+
+  data = {
+    "password" = var.linux_buildbot_password
+  }
+
+  type = "Opaque"
+
+  depends_on = [kubernetes_namespace.llvm_premerge_linux_buildbot]
+}
+
+resource "kubernetes_secret" "windows_2022_buildbot_password" {
+  metadata {
+    name      = "windows-buildbot-password"
+    namespace = "llvm-premerge-windows-2022-buildbot"
+  }
+
+  data = {
+    "password" = var.windows_buildbot_password
+  }
+
+  type = "Opaque"
+
+  depends_on = [kubernetes_namespace.llvm_premerge_windows_2022_buildbot]
+}
+
 resource "kubernetes_secret" "linux_github_pat" {
   metadata {
     name = "github-token"
@@ -180,6 +225,11 @@ resource "helm_release" "github_actions_runner_set_windows_2022" {
   ]
 }

+# TODO(boomanaiden154): We have to customize the command for the libcxx runner
+# containers because the file path has changed between the sets. Remove this
+# workaround once all of the runner sets have the runner binary in the same
+# path.
+ resource "helm_release" "github_actions_runner_set_libcxx" { name = "llvm-premerge-libcxx-runners" namespace = "llvm-premerge-libcxx-runners" @@ -188,7 +238,7 @@ resource "helm_release" "github_actions_runner_set_libcxx" { chart = "gha-runner-scale-set" values = [ - "${templatefile("libcxx_runners_values.yaml", { runner_group_name : var.runner_group_name, runner_image : var.libcxx_runner_image })}" + "${templatefile("libcxx_runners_values.yaml", { runner_group_name : var.runner_group_name, runner_image : var.libcxx_runner_image, command : "/home/runner/run.sh" })}" ] depends_on = [ @@ -206,7 +256,7 @@ resource "helm_release" "github_actions_runner_set_libcxx_release" { chart = "gha-runner-scale-set" values = [ - "${templatefile("libcxx_runners_values.yaml", { runner_group_name : var.runner_group_name, runner_image : var.libcxx_release_runner_image })}" + "${templatefile("libcxx_runners_values.yaml", { runner_group_name : var.runner_group_name, runner_image : var.libcxx_release_runner_image, command : "/home/runner/run.sh" })}" ] depends_on = [ @@ -224,7 +274,7 @@ resource "helm_release" "github_actions_runner_set_libcxx_next" { chart = "gha-runner-scale-set" values = [ - "${templatefile("libcxx_runners_values.yaml", { runner_group_name : var.runner_group_name, runner_image : var.libcxx_next_runner_image })}" + "${templatefile("libcxx_runners_values.yaml", { runner_group_name : var.runner_group_name, runner_image : var.libcxx_next_runner_image, command : "/home/gha/actions-runner/run.sh" })}" ] depends_on = [ @@ -234,6 +284,156 @@ resource "helm_release" "github_actions_runner_set_libcxx_next" { ] } +resource "kubernetes_role" "linux_buildbot_role" { + metadata { + name = "buildbot-role" + namespace = "llvm-premerge-linux-buildbot" + } + + rule { + api_groups = [""] + resources = ["pods", "pods/log", "pods/status"] + verbs = ["create", "delete", "get"] + } + + depends_on = [kubernetes_namespace.llvm_premerge_linux_buildbot] +} + +resource "kubernetes_service_account" "linux_buildbot_ksa" { + metadata { + name = "buildbot-ksa" + namespace = "llvm-premerge-linux-buildbot" + } + + depends_on = [kubernetes_namespace.llvm_premerge_linux_buildbot] +} + +resource "kubernetes_role_binding" "linux_buildbot_role_binding" { + metadata { + name = "buildbot-role-binding" + namespace = "llvm-premerge-linux-buildbot" + } + + role_ref { + kind = "Role" + name = "buildbot-role" + api_group = "rbac.authorization.k8s.io" + } + + subject { + kind = "ServiceAccount" + name = "buildbot-ksa" + namespace = "llvm-premerge-linux-buildbot" + } + + depends_on = [kubernetes_role.linux_buildbot_role, kubernetes_service_account.linux_buildbot_ksa] +} + +resource "kubernetes_service_account" "linux_buildbot_gcs_ksa" { + metadata { + name = "buildbot-gcs-ksa" + namespace = "llvm-premerge-linux-buildbot" + annotations = { + "iam.gke.io/gcp-service-account" = var.linux_object_cache_buildbot_service_account_email + } + } + + depends_on = [kubernetes_namespace.llvm_premerge_linux_buildbot] +} + +resource "kubernetes_manifest" "linux_buildbot_b1_deployment" { + manifest = yamldecode(templatefile("buildbot_deployment.yaml", { buildbot_name : format("%s-b1", var.linux_buildbot_name_template), buildbot_namespace : "llvm-premerge-linux-buildbot", secret_name : "linux-buildbot-password", buildbot_region : var.cluster_name })) + + depends_on = [kubernetes_namespace.llvm_premerge_linux_buildbot, kubernetes_secret.linux_buildbot_password] +} + +resource "kubernetes_manifest" "linux_buildbot_b2_deployment" { + manifest = 
yamldecode(templatefile("buildbot_deployment.yaml", { buildbot_name : format("%s-b2", var.linux_buildbot_name_template), buildbot_namespace : "llvm-premerge-linux-buildbot", secret_name : "linux-buildbot-password", buildbot_region : var.cluster_name })) + + depends_on = [kubernetes_namespace.llvm_premerge_linux_buildbot, kubernetes_secret.linux_buildbot_password] +} + +resource "kubernetes_manifest" "linux_buildbot_b3_deployment" { + manifest = yamldecode(templatefile("buildbot_deployment.yaml", { buildbot_name : format("%s-b3", var.linux_buildbot_name_template), buildbot_namespace : "llvm-premerge-linux-buildbot", secret_name : "linux-buildbot-password", buildbot_region : var.cluster_name })) + + depends_on = [kubernetes_namespace.llvm_premerge_linux_buildbot, kubernetes_secret.linux_buildbot_password] +} + +resource "kubernetes_role" "windows_2022_buildbot_role" { + metadata { + name = "buildbot-role" + namespace = "llvm-premerge-windows-2022-buildbot" + } + + rule { + api_groups = [""] + resources = ["pods", "pods/log", "pods/status"] + verbs = ["create", "delete", "get"] + } + + depends_on = [kubernetes_namespace.llvm_premerge_windows_2022_buildbot] +} + +resource "kubernetes_service_account" "windows_2022_buildbot_ksa" { + metadata { + name = "buildbot-ksa" + namespace = "llvm-premerge-windows-2022-buildbot" + } + + depends_on = [kubernetes_namespace.llvm_premerge_windows_2022_buildbot] +} + +resource "kubernetes_role_binding" "windows_2022_buildbot_role_binding" { + metadata { + name = "buildbot-role-binding" + namespace = "llvm-premerge-windows-2022-buildbot" + } + + role_ref { + kind = "Role" + name = "buildbot-role" + api_group = "rbac.authorization.k8s.io" + } + + subject { + kind = "ServiceAccount" + name = "buildbot-ksa" + namespace = "llvm-premerge-windows-2022-buildbot" + } + + depends_on = [kubernetes_role.windows_2022_buildbot_role, kubernetes_service_account.windows_2022_buildbot_ksa] +} + +resource "kubernetes_service_account" "windows_2022_buildbot_gcs_ksa" { + metadata { + name = "buildbot-gcs-ksa" + namespace = "llvm-premerge-windows-2022-buildbot" + annotations = { + "iam.gke.io/gcp-service-account" = var.windows_2022_object_cache_buildbot_service_account_email + } + } + + depends_on = [kubernetes_namespace.llvm_premerge_windows_2022_buildbot] +} + +resource "kubernetes_manifest" "windows_buildbot_b1_deployment" { + manifest = yamldecode(templatefile("buildbot_deployment.yaml", { buildbot_name : format("%s-b1", var.windows_buildbot_name_template), buildbot_namespace : "llvm-premerge-windows-2022-buildbot", secret_name : "windows-buildbot-password", buildbot_region : var.cluster_name })) + + depends_on = [kubernetes_namespace.llvm_premerge_windows_2022_buildbot, kubernetes_secret.windows_2022_buildbot_password] +} + +resource "kubernetes_manifest" "windows_buildbot_b2_deployment" { + manifest = yamldecode(templatefile("buildbot_deployment.yaml", { buildbot_name : format("%s-b2", var.windows_buildbot_name_template), buildbot_namespace : "llvm-premerge-windows-2022-buildbot", secret_name : "windows-buildbot-password", buildbot_region : var.cluster_name })) + + depends_on = [kubernetes_namespace.llvm_premerge_windows_2022_buildbot, kubernetes_secret.windows_2022_buildbot_password] +} + +resource "kubernetes_manifest" "windows_buildbot_b3_deployment" { + manifest = yamldecode(templatefile("buildbot_deployment.yaml", { buildbot_name : format("%s-b3", var.windows_buildbot_name_template), buildbot_namespace : "llvm-premerge-windows-2022-buildbot", secret_name : 
"windows-buildbot-password", buildbot_region : var.cluster_name })) + + depends_on = [kubernetes_namespace.llvm_premerge_windows_2022_buildbot, kubernetes_secret.windows_2022_buildbot_password] +} + resource "kubernetes_service_account" "linux_object_cache_ksa" { metadata { name = var.linux_runners_kubernetes_service_account_name @@ -258,6 +458,36 @@ resource "kubernetes_service_account" "windows_2022_object_cache_ksa" { depends_on = [kubernetes_namespace.llvm_premerge_windows_2022_runners] } +# We set up pod disruption budgets here. We need one per namespace and we need +# to set the min pod count to the maximum number of runner pods that can +# possibly exist so we never have a number of disruptible pods greater than +# zero. + +resource "kubernetes_manifest" "linux_runners_disruption_budget" { + manifest = yamldecode(templatefile("pod_disruption_budget.yaml", { runner_set_name : "llvm-premerge-linux-runners", min_pod_count : 16 })) + depends_on = [kubernetes_namespace.llvm_premerge_linux_runners] +} + +resource "kubernetes_manifest" "windows_2022_runners_disruption_budget" { + manifest = yamldecode(templatefile("pod_disruption_budget.yaml", { runner_set_name : "llvm-premerge-windows-2022-runners", min_pod_count : 16 })) + depends_on = [kubernetes_namespace.llvm_premerge_linux_runners] +} + +resource "kubernetes_manifest" "libcxx_runners_disruption_budget" { + manifest = yamldecode(templatefile("pod_disruption_budget.yaml", { runner_set_name : "llvm-premerge-libcxx-runners", min_pod_count : 32 })) + depends_on = [kubernetes_namespace.llvm_premerge_linux_runners] +} + +resource "kubernetes_manifest" "libcxx_release_runners_disruption_budget" { + manifest = yamldecode(templatefile("pod_disruption_budget.yaml", { runner_set_name : "llvm-premerge-libcxx-release-runners", min_pod_count : 32 })) + depends_on = [kubernetes_namespace.llvm_premerge_linux_runners] +} + +resource "kubernetes_manifest" "libcxx_next_runners_disruption_budget" { + manifest = yamldecode(templatefile("pod_disruption_budget.yaml", { runner_set_name : "llvm-premerge-libcxx-next-runners", min_pod_count : 32 })) + depends_on = [kubernetes_namespace.llvm_premerge_linux_runners] +} + resource "kubernetes_namespace" "grafana" { metadata { name = "grafana" diff --git a/premerge/premerge_resources/variables.tf b/premerge/premerge_resources/variables.tf index 210c711fa..eae2394fd 100644 --- a/premerge/premerge_resources/variables.tf +++ b/premerge/premerge_resources/variables.tf @@ -61,7 +61,7 @@ variable "runner_group_name" { variable "libcxx_runner_image" { type = string - default = "ghcr.io/llvm/libcxx-linux-builder:16f046281bf1a11d344eac1bc44d11f3e50e3b5d" + default = "ghcr.io/llvm/libcxx-linux-builder:36d31b0c008b2716329b5c9990f583decf919819" } variable "libcxx_release_runner_image" { @@ -69,10 +69,9 @@ variable "libcxx_release_runner_image" { default = "ghcr.io/llvm/libcxx-linux-builder:16f046281bf1a11d344eac1bc44d11f3e50e3b5d" } -# Same value as libcxx_runner_image at this time. variable "libcxx_next_runner_image" { type = string - default = "ghcr.io/llvm/libcxx-linux-builder:16f046281bf1a11d344eac1bc44d11f3e50e3b5d" + default = "ghcr.io/llvm/libcxx-linux-builder:36d31b0c008b2716329b5c9990f583decf919819" } variable "linux_runners_namespace_name" { @@ -104,3 +103,33 @@ variable "windows_2022_object_cache_gcp_service_account_email" { description = "The email associated with the service account for accessing the object cache on Windows." 
type = string } + +variable "linux_buildbot_name_template" { + description = "The name of the linux buildbot that will run tests postcommit." + type = string +} + +variable "linux_buildbot_password" { + description = "The password for the linux buildbot that will run tests postcommit." + type = string +} + +variable "windows_buildbot_name_template" { + description = "The name of the windows buildbot that will run tests postcommit." + type = string +} + +variable "windows_buildbot_password" { + description = "The password for the windows buildbot that will run tests postcommit." + type = string +} + +variable "linux_object_cache_buildbot_service_account_email" { + description = "The email associated with the service account for the buildbot worker accessing the object cache on Linux." + type = string +} + +variable "windows_2022_object_cache_buildbot_service_account_email" { + description = "The email associated with the service account for the buildbot worker accessing the object cache on Windows." + type = string +} diff --git a/zorg/buildbot/builders/ClangBuilder.py b/zorg/buildbot/builders/ClangBuilder.py index ff5d91bf7..9b3157d76 100644 --- a/zorg/buildbot/builders/ClangBuilder.py +++ b/zorg/buildbot/builders/ClangBuilder.py @@ -533,17 +533,19 @@ def _getClangCMakeBuildFactory( env=env)) # Get generated python, lnt - python = util.Interpolate('%(prop:builddir)s/test/sandbox/bin/python') - lnt = util.Interpolate('%(prop:builddir)s/test/sandbox/bin/lnt') - lnt_setup = util.Interpolate('%(prop:builddir)s/test/lnt/setup.py') + virtualenv_dir = 'Scripts' if vs else 'bin' + python = InterpolateToPosixPath(f'%(prop:builddir)s/test/sandbox/{virtualenv_dir}/python') + lnt_ext = '.exe' if vs else '' + lnt = InterpolateToPosixPath(f'%(prop:builddir)s/test/sandbox/{virtualenv_dir}/lnt{lnt_ext}') + lnt_setup = InterpolateToPosixPath('%(prop:builddir)s/test/lnt/setup.py') # Paths - sandbox = util.Interpolate('%(prop:builddir)s/test/sandbox') - test_suite_dir = util.Interpolate('%(prop:builddir)s/test/test-suite') + sandbox = InterpolateToPosixPath('%(prop:builddir)s/test/sandbox') + test_suite_dir = InterpolateToPosixPath('%(prop:builddir)s/test/test-suite') # Get latest built Clang (stage1 or stage2) - cc = util.Interpolate(f'%(prop:builddir)s/{compiler_path}/bin/{cc}') - cxx = util.Interpolate(f'%(prop:builddir)s/{compiler_path}/bin/{cxx}') + cc = InterpolateToPosixPath(f'%(prop:builddir)s/{compiler_path}/bin/{cc}') + cxx = InterpolateToPosixPath(f'%(prop:builddir)s/{compiler_path}/bin/{cxx}') # LNT Command line (don't pass -jN. Users need to pass both --threads # and --build-threads in nt_flags/test_suite_flags to get the same effect) @@ -558,7 +560,7 @@ def _getClangCMakeBuildFactory( # Append any option provided by the user test_suite_cmd.extend(nt_flags) else: - lit = util.Interpolate(f'%(prop:builddir)s/{stage1_build}/bin/llvm-lit') + lit = InterpolateToPosixPath(f'%(prop:builddir)s/{stage1_build}/bin/llvm-lit') test_suite_cmd = [python, lnt, 'runtest', 'test-suite', '--no-timestamp', '--sandbox', sandbox, @@ -567,7 +569,7 @@ def _getClangCMakeBuildFactory( '--cxx', cxx, '--use-lit', lit, # Carry on building even if there is a failure. 
-                          '--build-tool-options', '"-k"']
+                          '--build-tool-options', '"-k 0"' if '--use-make=ninja' in testsuite_flags else '"-k"']
 
     # Enable fortran if flang is checked out
     if checkout_flang:
         fortran_flags = [
diff --git a/zorg/buildbot/builders/DebugifyBuilder.py b/zorg/buildbot/builders/DebugifyBuilder.py
new file mode 100644
index 000000000..0e838ab6e
--- /dev/null
+++ b/zorg/buildbot/builders/DebugifyBuilder.py
@@ -0,0 +1,99 @@
+from buildbot.plugins import util
+from buildbot.steps.shell import ShellCommand
+from zorg.buildbot.builders import TestSuiteBuilder
+from zorg.buildbot.builders.TestSuiteBuilder import test_suite_build_path
+from zorg.buildbot.commands.CmakeCommand import CmakeCommand
+
+
+def addCheckDebugifyStep(f, debugify_output_path, compiler_dir=".", env={}):
+    script = util.Interpolate(
+        f"%(prop:builddir)s/{compiler_dir}/llvm/utils/llvm-original-di-preservation.py"
+    )
+    f.addStep(
+        ShellCommand(
+            name="check debugify output",
+            command=[
+                "python3",
+                script,
+                util.Interpolate(debugify_output_path),
+                "--acceptance-test",
+                "--reduce",
+            ],
+            description="check debugify output",
+            env=env,
+        )
+    )
+
+
+def getDebugifyBuildFactory(
+    depends_on_projects=None,
+    enable_runtimes="auto",
+    targets=None,
+    llvm_srcdir=None,
+    obj_dir=None,
+    checks=None,
+    install_dir=None,
+    clean=False,
+    test_suite_build_flags="-O2 -g -DNDEBUG",
+    extra_configure_args=None,
+    enable_origin_tracking=True,
+    extra_test_suite_configure_args=None,
+    env={},
+    **kwargs,
+):
+
+    # Make a local copy of the LLVM configure args, as we are going to modify it.
+    if extra_configure_args is not None:
+        llvm_cmake_args = extra_configure_args[:]
+    else:
+        llvm_cmake_args = list()
+
+    tracking_mode = "COVERAGE_AND_ORIGIN" if enable_origin_tracking else "COVERAGE"
+    CmakeCommand.applyRequiredOptions(llvm_cmake_args, [
+        ('-DLLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING=', tracking_mode)
+    ])
+
+    # This path will be passed through to util.Interpolate, so we leave it in this format.
+    # NB: This must be stored in the test suite build directory, as that is the only way to ensure that it is
+    # unconditionally cleaned up before (and not after) each run.
+    debugify_output_path = f"%(prop:builddir)s/{test_suite_build_path}/debugify-report.json"
+
+    # Make a local copy of the test suite configure args, as we are going to modify it.
+    if extra_test_suite_configure_args is not None:
+        test_suite_cmake_args = extra_test_suite_configure_args[:]
+    else:
+        test_suite_cmake_args = list()
+
+    CmakeCommand.applyDefaultOptions(test_suite_cmake_args, [
+        ('-DTEST_SUITE_SUBDIRS=', 'CTMark'),
+        ('-DTEST_SUITE_RUN_BENCHMARKS=', 'false'),
+        ('-DTEST_SUITE_COLLECT_CODE_SIZE=', 'false'),
+    ])
+    # The only configuration that currently makes sense for Debugify builds is an optimized debug info build; any build
+    # configuration adjustments can be made through the test_suite_build_flags arg.
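+    # For example, with the default test_suite_build_flags this assembles to roughly:
+    #   -O2 -g -DNDEBUG -Xclang -fverify-debuginfo-preserve
+    #   -Xclang -fverify-debuginfo-preserve-export=%(prop:builddir)s/test/build-test-suite/debugify-report.json
+    #   -mllvm --debugify-quiet -mllvm -debugify-level=locations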
+    build_flags = f'{test_suite_build_flags} -Xclang -fverify-debuginfo-preserve -Xclang -fverify-debuginfo-preserve-export={debugify_output_path} -mllvm --debugify-quiet -mllvm -debugify-level=locations'
+    CmakeCommand.applyRequiredOptions(test_suite_cmake_args, [
+        ('-DCMAKE_BUILD_TYPE=', 'RelWithDebInfo'),
+    ])
+    test_suite_cmake_args += [
+        util.Interpolate(f"-DCMAKE_C_FLAGS_RELWITHDEBINFO={build_flags}"),
+        util.Interpolate(f"-DCMAKE_CXX_FLAGS_RELWITHDEBINFO={build_flags}"),
+    ]
+
+    f = TestSuiteBuilder.getTestSuiteBuildFactory(
+        depends_on_projects=depends_on_projects,
+        enable_runtimes=enable_runtimes,
+        targets=targets,
+        llvm_srcdir=llvm_srcdir,
+        obj_dir=obj_dir,
+        checks=checks,
+        install_dir=install_dir,
+        clean=clean,
+        extra_configure_args=llvm_cmake_args,
+        extra_test_suite_configure_args=test_suite_cmake_args,
+        **kwargs
+    )
+
+    addCheckDebugifyStep(f, debugify_output_path, compiler_dir=f.monorepo_dir, env=env)
+
+    return f
diff --git a/zorg/buildbot/builders/TestSuiteBuilder.py b/zorg/buildbot/builders/TestSuiteBuilder.py
index fb9c3bb49..86753bd43 100644
--- a/zorg/buildbot/builders/TestSuiteBuilder.py
+++ b/zorg/buildbot/builders/TestSuiteBuilder.py
@@ -8,6 +8,9 @@
 from zorg.buildbot.commands.NinjaCommand import NinjaCommand
 from zorg.buildbot.commands.LitTestCommand import LitTestCommand
 
+# The DebugifyBuilder needs to know the test-suite build directory, so we share the build directory via this variable.
+test_suite_build_path = 'test/build-test-suite'
+
 # This builder uses UnifiedTreeBuilders and adds a step running
 # llvm-test-suite with cmake and ninja.
@@ -16,6 +19,7 @@ def addTestSuiteStep(
         compiler_dir = '.',
         env = None,
         lit_args = None,
+        extra_configure_args = None,
         **kwargs):
 
     # Set defaults
@@ -24,15 +28,19 @@
     if lit_args is None:
         lit_args = []
 
-    cc = util.Interpolate('-DCMAKE_C_COMPILER=' + '%(prop:builddir)s/'+compiler_dir+'/bin/clang')
-    cxx = util.Interpolate('-DCMAKE_CXX_COMPILER=' + '%(prop:builddir)s/'+compiler_dir+'/bin/clang++')
+    cc = util.Interpolate('-DCMAKE_C_COMPILER=%(prop:builddir)s/'+compiler_dir+'/bin/clang')
+    cxx = util.Interpolate('-DCMAKE_CXX_COMPILER=%(prop:builddir)s/'+compiler_dir+'/bin/clang++')
     lit = util.Interpolate('%(prop:builddir)s/' + compiler_dir + '/bin/llvm-lit')
     test_suite_base_dir = util.Interpolate('%(prop:builddir)s/' + 'test')
     test_suite_src_dir = util.Interpolate('%(prop:builddir)s/' + 'test/test-suite')
-    test_suite_workdir = util.Interpolate('%(prop:builddir)s/' + 'test/build-test-suite')
-    cmake_lit_arg = util.Interpolate('-DTEST_SUITE_LIT:FILEPATH=' + '%(prop:builddir)s/' + compiler_dir + '/bin/llvm-lit')
+    test_suite_workdir = util.Interpolate('%(prop:builddir)s/' + test_suite_build_path)
+    cmake_lit_arg = util.Interpolate('-DTEST_SUITE_LIT:FILEPATH=%(prop:builddir)s/' + compiler_dir + '/bin/llvm-lit')
 
     # used for cmake building test-suite step
-    options = [cc, cxx, cmake_lit_arg]
+    if extra_configure_args is not None:
+        cmake_args = extra_configure_args[:]
+    else:
+        cmake_args = list()
+    cmake_args.extend([cc, cxx, cmake_lit_arg])
 
     # always clobber the build directory to test each new compiler
     f.addStep(ShellCommand(name='Clean Test Suite Build dir',
@@ -51,7 +59,7 @@
                            haltOnFailure=True,
                            description='Running cmake on Test Suite dir',
                            workdir=test_suite_workdir,
-                           options=options,
+                           options=cmake_args,
                            path=test_suite_src_dir,
                            generator='Ninja'))
 
@@ -80,6 +88,7 @@ def getTestSuiteBuildFactory(
         install_dir = None,
         clean = False,
         extra_configure_args = None,
+        extra_test_suite_configure_args = None,
         env = None,
         **kwargs):
 
@@ -109,6 +118,7 @@
                      compiler_dir=f.obj_dir,
                      env=env,
                      lit_args=lit_args,
+                     extra_configure_args=extra_test_suite_configure_args,
                      **kwargs)
 
     return f
diff --git a/zorg/buildbot/builders/UnifiedTreeBuilder.py b/zorg/buildbot/builders/UnifiedTreeBuilder.py
index 5bc79819b..ce921b7e8 100644
--- a/zorg/buildbot/builders/UnifiedTreeBuilder.py
+++ b/zorg/buildbot/builders/UnifiedTreeBuilder.py
@@ -612,6 +612,7 @@ def getCmakeWithNinjaMultistageBuildFactory(
 
 def getCmakeExBuildFactory(
         depends_on_projects = None,
+        enable_projects = "auto",
         enable_runtimes = "auto",
         cmake_definitions = None,
         cmake_options = None,
@@ -666,6 +667,17 @@
         If this parameter is not None and contains the non-runtime project names, they will go to
         LLVM_ENABLE_PROJECTS CMake configuration parameter.
 
+    enable_projects : list, optional
+        A list of the LLVM projects (except the runtime projects) for the build (default is 'auto').
+        This list goes into the factory's 'enable_projects' attribute and LLVM_ENABLE_PROJECTS CMake configuration
+        parameter.
+
+        If "auto" is specified, the non-runtime projects will be extracted from the 'depends_on_projects' parameter.
+
+        If None is specified, LLVM_ENABLE_PROJECTS will not be set for the CMake configuration step.
+
+        (see LLVMBuildFactory for more details).
+
     enable_runtimes : list, optional
         A list of the runtime project names for the build (default is 'auto').
         This list goes into the factory's 'enable_runtimes' attribute and LLVM_ENABLE_RUNTIMES CMake configuration
         parameter.
@@ -894,6 +906,7 @@ def norm_target_list_arg(lst):
 
     # Default root factory. We will collect all steps for all stages here.
     f = LLVMBuildFactory(
            depends_on_projects = depends_on_projects,
+           enable_projects = enable_projects,
            enable_runtimes = enable_runtimes,
            hint = hint,
            llvm_srcdir = llvm_srcdir,
diff --git a/zorg/buildbot/builders/annotated/hip-build.sh b/zorg/buildbot/builders/annotated/hip-build.sh
index c57ca4b4d..83b4b095e 100755
--- a/zorg/buildbot/builders/annotated/hip-build.sh
+++ b/zorg/buildbot/builders/annotated/hip-build.sh
@@ -52,7 +52,7 @@
 if [ ! -d "${LLVM_ROOT}" ]; then
 fi
 
 build_step "Updating llvm-project repo"
-git -C "${LLVM_ROOT}" fetch origin
+git -C "${LLVM_ROOT}" fetch --prune origin
 git -C "${LLVM_ROOT}" reset --hard "${LLVM_REVISION}"
 }
 
@@ -64,13 +64,19 @@
-d "${TESTSUITE_ROOT}" ]; then fi build_step "Updating llvm-test-suite repo" -git -C "${TESTSUITE_ROOT}" fetch origin +git -C "${TESTSUITE_ROOT}" fetch --prune origin git -C "${TESTSUITE_ROOT}" reset --hard origin/main } # Start building LLVM, Clang, Lld, clang-tools-extra, compiler-rt build_llvm() { build_step "Configure LLVM Build" + +# Nuke the build dir to start from a cleaner state and rely on ccache for build time +if [ -d "${LLVM_BUILD_DIR}" ]; then + rm -rf "${LLVM_BUILD_DIR}" +fi + mkdir -p "${LLVM_BUILD_DIR}" cd "${LLVM_BUILD_DIR}" cmake -G Ninja \ @@ -136,4 +142,3 @@ update_test_suite build_test_suite exit 0 - diff --git a/zorg/buildbot/builders/annotated/libc-linux.py b/zorg/buildbot/builders/annotated/libc-linux.py index 0349a3918..87171a88a 100644 --- a/zorg/buildbot/builders/annotated/libc-linux.py +++ b/zorg/buildbot/builders/annotated/libc-linux.py @@ -121,7 +121,12 @@ def main(argv): if arm32_build and qemu_build: cmake_args.append('-DLIBC_TARGET_TRIPLE=arm-linux-gnueabihf') - cmake_args.append('-DLIBC_TEST_COMPILE_OPTIONS_DEFAULT=-static') + cmake_args.append('-DCMAKE_SYSROOT=/opt/sysroot-deb-armhf-stable') + cmake_args.append('-DCMAKE_C_COMPILER_TARGET=arm-linux-gnueabihf') + cmake_args.append('-DCMAKE_CXX_COMPILER_TARGET=arm-linux-gnueabihf') + cmake_args.append('-DCMAKE_AR=/usr/bin/llvm-ar-20') + cmake_args.append('-DCMAKE_RANLIB=/usr/bin/llvm-ranlib-20') + cmake_args.append('-DLIBC_UNITTEST_ENV=QEMU_LD_PREFIX=/opt/sysroot-deb-armhf-stable') if bootstrap_build: cmake_root = 'llvm' diff --git a/zorg/buildbot/builders/annotated/libc-windows.py b/zorg/buildbot/builders/annotated/libc-windows.py index 5d83348c5..7f7bf4aa5 100644 --- a/zorg/buildbot/builders/annotated/libc-windows.py +++ b/zorg/buildbot/builders/annotated/libc-windows.py @@ -39,17 +39,12 @@ def main(argv): if args.asan: cmake_args.append('-DLLVM_USE_SANITIZER=Address') - cmake_args.append('-DLLVM_ENABLE_PROJECTS=libc') - cmake_args.append('-DLLVM_TARGETS_TO_BUILD=X86') - cmake_args.append('-DLLVM_FORCE_BUILD_RUNTIME=libc') - cmake_args.append('-DLLVM_NATIVE_ARCH=x86_64') - cmake_args.append('-DLLVM_HOST_TRIPLE=x86_64-window-x86-gnu') - cmake_args.append('-DLLVM_LIBC_MPFR_INSTALL_PATH=C:/src/install') + cmake_args.append('-DLLVM_ENABLE_RUNTIMES=libc') - run_command(['cmake', os.path.join(source_dir, 'llvm')] + cmake_args) + run_command(['cmake', os.path.join(source_dir, 'runtimes')] + cmake_args) with step('build llvmlibc', halt_on_fail=True): - run_command(['ninja', 'llvmlibc']) + run_command(['ninja', 'libc']) with step('check-libc'): run_command(['ninja', 'check-libc']) diff --git a/zorg/buildbot/builders/annotated/premerge/dispatch_job.py b/zorg/buildbot/builders/annotated/premerge/dispatch_job.py new file mode 100644 index 000000000..994788f47 --- /dev/null +++ b/zorg/buildbot/builders/annotated/premerge/dispatch_job.py @@ -0,0 +1,301 @@ +"""Dispatches a job to the k8s cluster. + +This script takes in a commit SHA to test along with the platform, spawns a job +to test it, and then streams the logs from the job. We read logs from the job +every so often using the kuberntes logging API rather than directly executing +commands inside the container and streaming the output. This is to work +around https://github.com/kubernetes-sigs/apiserver-network-proxy/issues/748. 
+""" + +import sys +import logging +import time +import dateutil +import datetime +import json +import os + +import kubernetes + +PLATFORM_TO_NAMESPACE = { + "Linux": "llvm-premerge-linux-buildbot", + "Windows": "llvm-premerge-windows-2022-buildbot", +} +PLATFORM_TAINT = { + "Linux": ("buildbot-platform", "linux"), + "Windows": ("node.kubernetes.io/os", "windows"), +} +PLATFORM_TO_BUILDBOT_PLATFORM = {"Linux": "linux", "Windows": "windows-2022"} +PLATFORM_CONTAINER = { + "Linux": "ghcr.io/llvm/ci-ubuntu-24.04", + "Windows": "ghcr.io/llvm/ci-windows-2022", +} +PLATFORM_TO_GCS_BUCKET_SUFFIX = { + "Linux": "-object-cache-linux", + "Windows": "-object-cache-windows", +} +LOG_SECONDS_TO_QUERY = 10 +SECONDS_QUERY_LOGS_EVERY = 5 + + +def start_build( + k8s_client, pod_name: str, platform: str, commands: list[str], args: list[str] +) -> None: + """Spawns a pod to run the specified commands. + + Args: + k8s_client: The kubernetes client instance to use for spawning the pod. + pod_name: The name of the pod to start. + platform: The platform to launch the pod on. + commands: The commands to run upon pod start. + args: Arguments to pass to the command upon pod start. + """ + taint_key, taint_value = PLATFORM_TAINT[platform] + pod_definition = { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "name": pod_name, + "namespace": PLATFORM_TO_NAMESPACE[platform], + }, + "spec": { + "serviceAccountName": "buildbot-gcs-ksa", + "tolerations": [ + { + "key": taint_key, + "operator": "Equal", + "value": taint_value, + "effect": "NoSchedule", + } + ], + "nodeSelector": { + "buildbot-platform": PLATFORM_TO_BUILDBOT_PLATFORM[platform] + }, + "containers": [ + { + "name": "build", + "image": PLATFORM_CONTAINER[platform], + "command": commands, + "args": args, + "resources": { + "requests": {"cpu": 55, "memory": "200Gi"}, + "limits": {"cpu": 64, "memory": "256Gi"}, + }, + } + ], + "restartPolicy": "Never", + }, + } + if platform == "Windows": + pod_definition["spec"]["volumes"] = [{"name": "builddir", "emptyDir": {}}] + pod_definition["spec"]["containers"][0]["volumeMounts"] = [ + {"name": "builddir", "mountPath": "C:/_work"} + ] + kubernetes.utils.create_from_dict(k8s_client, pod_definition) + + +def start_build_linux(commit_sha: str, bucket_name: str, k8s_client) -> str: + """Starts a pod to build/test on Linux at the specified SHA.""" + pod_name = f"build-{commit_sha}" + commands = [ + "set -ex", + "git clone --depth 100 https://github.com/llvm/llvm-project", + "cd llvm-project", + f"git checkout {commit_sha}", + "export CC=clang", + "export CXX=clang++", + "export POSTCOMMIT_CI=1", + f"export SCCACHE_GCS_BUCKET={bucket_name}", + f"export CACHE_GCS_BUCKET={bucket_name}", + "export SCCACHE_GCS_RW_MODE=READ_WRITE", + "export SCCACHE_IDLE_TIMEOUT=0", + "sccache --start-server", + './.ci/monolithic-linux.sh "bolt;clang;clang-tools-extra;flang;libclc;lld;lldb;llvm;mlir;polly" "check-bolt check-clang check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly check-lit" "compiler-rt;flang-rt;libc;libcxx;libcxxabi;libunwind" "check-compiler-rt check-flang-rt check-libc" "check-cxx check-cxxabi check-unwind" "OFF"', + "python .ci/cache_lit_timing_files.py upload", + "echo BUILD FINISHED", + ] + start_build( + k8s_client, pod_name, "Linux", ["/bin/bash", "-c", ";".join(commands)], [] + ) + return pod_name + + +def start_build_windows(commit_sha: str, bucket_name: str, k8s_client): + """Starts a pod to build/test on Windows at the specified SHA.""" + pod_name = f"build-{commit_sha}" + 
+    bash_commands = [
+        "set -ex",
+        "cd C:/_work",
+        "git clone --config core.autocrlf=false --depth 100 https://github.com/llvm/llvm-project",
+        "cd llvm-project",
+        f"git checkout {commit_sha}",
+        "export POSTCOMMIT_CI=1",
+        f"export SCCACHE_GCS_BUCKET={bucket_name}",
+        f"export CACHE_GCS_BUCKET={bucket_name}",
+        "export SCCACHE_GCS_RW_MODE=READ_WRITE",
+        "export SCCACHE_IDLE_TIMEOUT=0",
+        "sccache --start-server",
+        '.ci/monolithic-windows.sh "clang;clang-tools-extra;libclc;lld;llvm;mlir;polly" "check-clang check-clang-tools check-lld check-llvm check-mlir check-polly check-lit" "compiler-rt" "check-compiler-rt"',
+        "python .ci/cache_lit_timing_files.py upload",
+        "echo BUILD FINISHED",
+    ]
+    bash_command = f"bash -c \"{';'.join(bash_commands)}\"\""
+    commands = [
+        "call C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64",
+        bash_command,
+    ]
+    args = ["/c " + " && ".join(commands)]
+    start_build(k8s_client, pod_name, "Windows", ["cmd.exe"], args)
+    return pod_name
+
+
+def read_logs(pod_name: str, namespace: str, v1_api) -> list[str]:
+    """Reads logs from the specified pod.
+
+    Reads logs using the k8s API and returns a nicely formatted list of
+    strings.
+
+    Args:
+      pod_name: The name of the pod to read logs from.
+      namespace: The namespace the pod is in.
+      v1_api: The kubernetes API instance to use for querying logs.
+
+    Returns:
+      A list of strings representing the log lines.
+    """
+    logs = v1_api.read_namespaced_pod_log(
+        name=pod_name,
+        namespace=namespace,
+        timestamps=True,
+        since_seconds=LOG_SECONDS_TO_QUERY,
+    )
+    return logs.split("\n")[:-1]
+
+
+def get_pod_status(pod_name: str, namespace: str, v1_api) -> str:
+    """Gets the status of a pod."""
+    return v1_api.read_namespaced_pod_status(
+        name=pod_name, namespace=namespace
+    ).status.phase
+
+
+def get_logs_to_print(
+    logs: list[str], latest_time: datetime.datetime
+) -> tuple[datetime.datetime, list[str]]:
+    """Get the logs that we should be printing.
+
+    This function takes in a raw list of logs along with the timestamp of the
+    last log line to be printed and returns the new log lines that should be
+    printed.
+
+    Args:
+      logs: The raw list of log lines.
+      latest_time: The timestamp from the last log line that was printed.
+
+    Returns:
+      A tuple containing the timestamp of the last log line returned and a list
+      of strings containing the log lines that should be printed.
+    """
+    first_new_index = 0
+    time_stamp = latest_time
+    for log_line in logs:
+        time_stamp_str = log_line.split(" ")[0]
+        time_stamp = dateutil.parser.parse(time_stamp_str[:-1])
+        if time_stamp > latest_time:
+            break
+        first_new_index += 1
+    last_time_stamp = latest_time
+    if logs:
+        last_time_stamp_str = logs[-1].split(" ")[0]
+        last_time_stamp = dateutil.parser.parse(last_time_stamp_str[:-1])
+    return (last_time_stamp, logs[first_new_index:])
+
+
+def print_logs(
+    pod_name: str, namespace: str, v1_api, latest_time: datetime.datetime
+) -> tuple[bool, datetime.datetime]:
+    """Queries the pod and prints the relevant log lines.
+
+    Args:
+      pod_name: The pod to print the logs for.
+      namespace: The namespace the pod is in.
+      v1_api: The kubernetes API client instance to use for querying the logs.
+      latest_time: The timestamp of the last log line to be printed.
+
+    Returns:
+      A tuple containing a boolean representing whether or not the pod has
+      finished executing and the timestamp of the last log line printed.
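+
+      Completion is detected by watching for the literal "BUILD FINISHED"
+      marker that the dispatched build commands echo as their final step.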
+ """ + logs = read_logs(pod_name, namespace, v1_api) + new_time_stamp, logs_to_print = get_logs_to_print(logs, lastest_time) + pod_finished = False + for log_line in logs_to_print: + print(log_line.split("\r")[-1]) + if "BUILD FINISHED" in log_line: + pod_finished = True + + return (pod_finished, new_time_stamp) + + +def main(commit_sha: str, platform: str): + kubernetes.config.load_incluster_config() + k8s_client = kubernetes.client.ApiClient() + bucket_name = ( + os.environ["BUILDBOT_REGION"] + PLATFORM_TO_GCS_BUCKET_SUFFIX[platform] + ) + if platform == "Linux": + pod_name = start_build_linux(commit_sha, bucket_name, k8s_client) + elif platform == "Windows": + pod_name = start_build_windows(commit_sha, bucket_name, k8s_client) + else: + raise ValueError("Unrecognized platform.") + namespace = PLATFORM_TO_NAMESPACE[platform] + latest_time = datetime.datetime.min + v1_api = kubernetes.client.CoreV1Api() + print("@@@BUILD_STEP Build/Test@@@") + pod_status = "Pending" + while pod_status == "Pending": + print("Waiting for the pod to schedule onto a machine.") + time.sleep(SECONDS_QUERY_LOGS_EVERY) + pod_status = get_pod_status(pod_name, namespace, v1_api) + while True: + try: + pod_finished, latest_time = print_logs( + pod_name, namespace, v1_api, latest_time + ) + if pod_finished: + break + pod_status = get_pod_status(pod_name, namespace, v1_api) + if pod_status == "Failed": + break + except kubernetes.client.exceptions.ApiException as log_exception: + if "ContainerCreating" in json.loads(log_exception.body)["message"]: + logging.warning( + "Cannot yet read logs from the pod: waiting for the container to start." + ) + else: + logging.error(f"Failed to get logs from the pod: {log_exception}") + break + time.sleep(SECONDS_QUERY_LOGS_EVERY) + pod_status = get_pod_status(pod_name, namespace, v1_api) + while pod_status == "Running": + print("Waiting for pod to complete.") + time.sleep(SECONDS_QUERY_LOGS_EVERY) + pod_status = get_pod_status(pod_name, namespace, v1_api) + v1_api.delete_namespaced_pod(pod_name, namespace) + if pod_status == "Succeeded": + print("Job Succeeded.") + sys.exit(0) + else: + print("Job Failed.") + sys.exit(1) + + +if __name__ == "__main__": + if len(sys.argv) != 2: + logging.fatal("Expected usage is dispatch_job.py {platform}") + sys.exit(1) + if "BUILDBOT_REVISION" not in os.environ: + logging.fatal("Expected to have BUILDBOT_REVISION environment variable set.") + sys.exit(1) + main(os.environ["BUILDBOT_REVISION"], sys.argv[1]) diff --git a/zorg/buildbot/builders/annotated/premerge/dispatch_job_test.py b/zorg/buildbot/builders/annotated/premerge/dispatch_job_test.py new file mode 100644 index 000000000..8a7dc311a --- /dev/null +++ b/zorg/buildbot/builders/annotated/premerge/dispatch_job_test.py @@ -0,0 +1,86 @@ +"""Tests for the dispatch_job.py script.""" + +import unittest +import datetime +import dateutil + +import dispatch_job + + +class TestDispatchJobs(unittest.TestCase): + def test_get_logs_first_time(self): + """Test we return the correct logs if we have not seen any before.""" + log_lines = [ + "2025-07-29T15:48:00.259595535Z test1", + "2025-07-29T15:48:00.383251277Z test2", + ] + current_timestamp = datetime.datetime.min + latest_timestamp, lines_to_print = dispatch_job.get_logs_to_print( + log_lines, current_timestamp + ) + self.assertSequenceEqual( + lines_to_print, + [ + "2025-07-29T15:48:00.259595535Z test1", + "2025-07-29T15:48:00.383251277Z test2", + ], + ) + self.assertEqual( + latest_timestamp, dateutil.parser.parse("2025-07-29T15:48:00.383251277") + ) 
+
+    def test_get_logs_nonoverlapping(self):
+        """Test we return the correct logs for non-overlapping ranges.
+
+        Test that if the timestamp of the last log line that we have printed
+        is earlier than every entry in the current set returned by kubernetes,
+        we return all of the lines.
+        """
+        log_lines = [
+            "2025-07-29T15:48:01.787177054Z test1",
+            "2025-07-29T15:48:03.074715108Z test2",
+        ]
+        current_timestamp = dateutil.parser.parse("2025-07-29T15:48:00.383251277")
+        latest_timestamp, lines_to_print = dispatch_job.get_logs_to_print(
+            log_lines, current_timestamp
+        )
+        self.assertSequenceEqual(
+            lines_to_print,
+            [
+                "2025-07-29T15:48:01.787177054Z test1",
+                "2025-07-29T15:48:03.074715108Z test2",
+            ],
+        )
+        self.assertEqual(
+            latest_timestamp, dateutil.parser.parse("2025-07-29T15:48:03.074715108")
+        )
+
+    def test_get_logs_overlapping(self):
+        """Test we return the correct logs for overlapping ranges.
+
+        Test that if the last line to be printed is contained within the logs
+        kubernetes returned, we skip the lines that have already been printed.
+        """
+        log_lines = [
+            "2025-07-29T15:48:00.383251277Z test1",
+            "2025-07-29T15:48:01.787177054Z test2",
+            "2025-07-29T15:48:03.074715108Z test3",
+        ]
+        current_timestamp = dateutil.parser.parse("2025-07-29T15:48:00.383251277")
+        latest_timestamp, lines_to_print = dispatch_job.get_logs_to_print(
+            log_lines, current_timestamp
+        )
+        self.assertSequenceEqual(
+            lines_to_print,
+            [
+                "2025-07-29T15:48:01.787177054Z test2",
+                "2025-07-29T15:48:03.074715108Z test3",
+            ],
+        )
+        self.assertEqual(
+            latest_timestamp, dateutil.parser.parse("2025-07-29T15:48:03.074715108")
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/zorg/buildbot/builders/annotated/profcheck.sh b/zorg/buildbot/builders/annotated/profcheck.sh
new file mode 100755
index 000000000..bf035f4bd
--- /dev/null
+++ b/zorg/buildbot/builders/annotated/profcheck.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+set -ex
+
+echo @@@BUILD_STEP CMake@@@
+
+cmake -GNinja \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DLLVM_ENABLE_ASSERTIONS=ON \
+  -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+  -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+  -DLLVM_LIT_ARGS='--exclude-xfail' \
+  -DLLVM_ENABLE_PROFCHECK=ON \
+  ../llvm-project/llvm
+
+echo @@@BUILD_STEP Ninja@@@
+
+export LIT_XFAIL="$(cat ../llvm-project/llvm/utils/profcheck-xfail.txt | tr '\n' ';')"
+export LIT_USE_INTERNAL_SHELL=0
+ninja check-llvm
diff --git a/zorg/buildbot/builders/annotated/rise-riscv-build.sh b/zorg/buildbot/builders/annotated/rise-riscv-build.sh
index 4b213ef91..0c1bd54f5 100755
--- a/zorg/buildbot/builders/annotated/rise-riscv-build.sh
+++ b/zorg/buildbot/builders/annotated/rise-riscv-build.sh
@@ -29,7 +29,7 @@ case "$BUILDBOT_BUILDERNAME" in
     export BB_QEMU_MEM="64G"
     ;;
   "clang-riscv-rva23-evl-vec-2stage")
-    TARGET_CFLAGS="-march=rva23u64 -mllvm -force-tail-folding-style=data-with-evl -mllvm -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue"
+    TARGET_CFLAGS="-march=rva23u64 -mllvm -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue"
     export BB_IMG_DIR=$(pwd)/..
     # TODO: Switch to specifying rva23u64 once qemu on the builder is
    # upgraded to a version that recognises it.
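(A note on the LIT_XFAIL export in profcheck.sh above: it flattens llvm/utils/profcheck-xfail.txt, one test name per line, into the semicolon-separated list that lit expects. A minimal Python sketch of the same transformation, assuming that one-name-per-line format; the checkout path is illustrative:)

import pathlib

# Equivalent of: cat profcheck-xfail.txt | tr '\n' ';'
# (tr also leaves a trailing ';', which just yields an empty entry that
# matches no test.)
xfail_file = pathlib.Path("llvm-project/llvm/utils/profcheck-xfail.txt")
lit_xfail = ";".join(xfail_file.read_text().splitlines())
print(lit_xfail)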
diff --git a/zorg/buildbot/builders/annotated/rise-riscv-gauntlet-build.sh b/zorg/buildbot/builders/annotated/rise-riscv-gauntlet-build.sh
index 60e4e5bbc..29103305a 100755
--- a/zorg/buildbot/builders/annotated/rise-riscv-gauntlet-build.sh
+++ b/zorg/buildbot/builders/annotated/rise-riscv-gauntlet-build.sh
@@ -81,7 +81,7 @@
 set +e
 # Skip a few tests that have excessive runtimes relative to the others.
 export LIT_FILTER_OUT='(SingleSource/Benchmarks/Polybench/linear-algebra/solvers/(ludcmp|lu)|MicroBenchmarks/LoopVectorization/LoopInterleavingBenchmarks)'
-for CONF in rva20 rva22 rva23 rva23-evl rva23-mrvv-vec-bits; do
+for CONF in rva20 rva22 rva23 rva23-zvl1024b rva23-mrvv-vec-bits; do
   RVA23_QEMU_CPU="rv64,zba=true,zbb=true,zbc=false,zbs=true,zfhmin=true,v=true,vext_spec=v1.0,zkt=true,zvfhmin=true,zvbb=true,zvkt=true,zihintntl=true,zicond=true,zimop=true,zcmop=true,zcb=true,zfa=true,zawrs=true,rvv_ta_all_1s=true,rvv_ma_all_1s=true,rvv_vl_half_avl=true"
   case "$CONF" in
     rva20)
@@ -96,9 +96,9 @@
       CFLAGS="-march=rva23u64"
       QEMU_CPU=$RVA23_QEMU_CPU
       ;;
-    rva23-evl)
-      CFLAGS="-march=rva23u64 -mllvm -force-tail-folding-style=data-with-evl -mllvm -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue"
-      QEMU_CPU=$RVA23_QEMU_CPU
+    rva23-zvl1024b)
+      CFLAGS="-march=rva23u64_zvl1024b"
+      QEMU_CPU="$RVA23_QEMU_CPU,vlen=1024"
       ;;
     rva23-mrvv-vec-bits)
      CFLAGS="-march=rva23u64 -mrvv-vector-bits=zvl"
diff --git a/zorg/buildbot/builders/annotated/sanitizer-windows.py b/zorg/buildbot/builders/annotated/sanitizer-windows.py
deleted file mode 100644
index d654e5b8f..000000000
--- a/zorg/buildbot/builders/annotated/sanitizer-windows.py
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/python
-
-import os
-import sys
-import annotated_builder
-import util
-
-
-class SanitizerAnnotatedBuilder(annotated_builder.AnnotatedBuilder):
-
-    """Customizes the 'build' step of the generic AnnotatedBuilder"""
-
-    def build(self, stage_name, build_dir, jobs=None):
-        # The basic idea here is to run 'ninja compiler-rt ; ninja clang lld'.
-        # This ensures that portability issues in compiler-rt code are found
-        # first. Then, we only build clang and lld, the primary dependencies of
-        # the sanitizer test suites, to keep cycle time low. This means there
-        # are still some remaining test dependencies (FileCheck) that may be
-        # compiled during the check step, but there shouldn't be that many.
-        self.report_build_step('%s build' % (stage_name,))
-        self.halt_on_failure()
-        base_cmd = ['ninja']
-        if jobs:
-            base_cmd += ['-j', str(jobs)]
-        early_targets = ['compiler-rt']
-        late_targets = ['clang', 'lld']
-        util.report_run_cmd(base_cmd + early_targets, cwd=build_dir)
-        util.report_run_cmd(base_cmd + late_targets, cwd=build_dir)
-
-
-def main(argv):
-    ap = annotated_builder.get_argument_parser()
-    args = ap.parse_args(argv[1:])
-
-    projects = ['llvm', 'clang', 'lld', 'compiler-rt']
-    stages = 1
-    extra_cmake_args = [
-        '-DCMAKE_BUILD_TYPE=Release',
-        '-DLLVM_ENABLE_PDB=ON',
-        '-DLLVM_ENABLE_ASSERTIONS=ON',
-        '-DLLVM_TARGETS_TO_BUILD=X86',
-    ]
-    check_targets = ['check-asan', 'check-asan-dynamic', 'check-sanitizer',
-                     'check-ubsan', 'check-fuzzer', 'check-cfi',
-                     'check-profile', 'check-builtins']
-
-    # These arguments are a bit misleading, they really mean use cl.exe for
-    # stage1 instead of GCC.
-    compiler = 'clang-cl'
-    linker = 'lld-link'
-
-    builder = SanitizerAnnotatedBuilder()
-    builder.run_steps(stages=stages,
-                      projects=projects,
-                      extra_cmake_args=extra_cmake_args,
-                      check_targets=check_targets,
-                      compiler=compiler,
-                      linker=linker,
-                      jobs=args.jobs)
-
-
-if __name__ == '__main__':
-    sys.path.append(os.path.dirname(__file__))
-    sys.exit(main(sys.argv))
diff --git a/zorg/buildbot/builders/sanitizers/buildbot_functions.sh b/zorg/buildbot/builders/sanitizers/buildbot_functions.sh
index 11d9b09a5..0d3376282 100755
--- a/zorg/buildbot/builders/sanitizers/buildbot_functions.sh
+++ b/zorg/buildbot/builders/sanitizers/buildbot_functions.sh
@@ -630,5 +630,5 @@ function upload_stats() {
       -H Metadata-Flavor:Google > "${ROOT}/machine-type.txt" || true
     gsutil cp "${ROOT}/"{time,cpu,machine-type}".txt" "gs://sanitizer-buildbot-out/${BUILDBOT_BUILDERNAME}/${1}/${BUILDBOT_REVISION}/" || true
   fi
-  cat "${ROOT}/time.txt"
+  [[ ! -f "${ROOT}/time.txt" ]] || cat "${ROOT}/time.txt"
 }
diff --git a/zorg/jenkins/build.py b/zorg/jenkins/build.py
index 9683470d1..c06500649 100644
--- a/zorg/jenkins/build.py
+++ b/zorg/jenkins/build.py
@@ -1019,6 +1019,14 @@ def run_ws(cmd, env=None, sudo=False, err_okay=False):
     return run_cmd(conf.workspace, cmd, env, sudo=sudo, err_okay=err_okay)
 
 
+def print_machine_info():
+    header("Machine Info")
+    run_ws(["sw_vers"])
+    run_ws(["xcodebuild", "-version"])
+    run_ws(["cmake", "--version"])
+    footer()
+
+
 def parse_args():
     """Get the command line arguments, and make sure they are correct."""
 
@@ -1088,6 +1096,7 @@ def main():
     args = parse_args()
 
     conf = Configuration(args)
+    print_machine_info()
     create_builddirs()
     try:
         if args.build_type == 'clang':
diff --git a/zorg/jenkins/jobs/jobs/clang-stage1-RA-as b/zorg/jenkins/jobs/jobs/clang-stage1-RA-as
new file mode 100644
index 000000000..82f96d87a
--- /dev/null
+++ b/zorg/jenkins/jobs/jobs/clang-stage1-RA-as
@@ -0,0 +1,156 @@
+pipeline {
+    options {
+        disableConcurrentBuilds()
+    }
+
+    parameters {
+        string(name: 'LABEL', defaultValue: params.LABEL ?: 'macos-arm64', description: 'Node label to run on')
+
+        string(name: 'GIT_REVISION', defaultValue: params.GIT_REVISION ?: '*/main', description: 'Git revision to build')
+    }
+
+    agent {
+        node {
+            label params.LABEL
+        }
+    }
+
+    stages {
+        stage('Checkout') {
+            steps {
+                dir('llvm-project') {
+                    checkout([$class: 'GitSCM', branches: [
+                        [name: params.GIT_REVISION]
+                    ], extensions: [
+                        [$class: 'CloneOption',
+                        timeout: 30]
+                    ], userRemoteConfigs: [
+                        [url: 'https://github.com/llvm/llvm-project.git']
+                    ]])
+                }
+                dir('llvm-zorg') {
+                    checkout([$class: 'GitSCM', branches: [
+                        [name: '*/main']
+                    ], userRemoteConfigs: [
+                        [url: 'https://github.com/llvm/llvm-zorg.git']
+                    ]])
+                }
+            }
+        }
+        stage('Setup Venv') {
+            environment {
+                PATH="$PATH:/usr/bin:/usr/local/bin"
+            }
+            steps {
+                sh '''
+                   # Non-incremental, so always delete just in case.
+                   rm -rf clang-build clang-install host-compiler *.tar.gz
+                   rm -rf venv
+
+                   python3 -m venv venv
+                   set +u
+                   source ./venv/bin/activate
+                   python -m pip install -r ./llvm-zorg/zorg/jenkins/jobs/requirements.txt
+                   set -u
+                '''
+            }
+        }
+        stage('Build') {
+            environment {
+                PATH="$PATH:/usr/bin:/usr/local/bin"
+                MACOSX_DEPLOYMENT_TARGET="13.6"
+            }
+            steps {
+                timeout(120) {
+                    withCredentials([string(credentialsId: 's3_resource_bucket', variable: 'S3_BUCKET')]) {
+                        sh '''
+                           set -u
+                           rm -rf build.properties
+
+                           source ./venv/bin/activate
+
+                           cd llvm-project
+                           git tag -a -m "First Commit" first_commit 97724f18c79c7cc81ced24239eb5e883bf1398ef || true
+
+                           git_desc=$(git describe --match "first_commit")
+                           export GIT_DISTANCE=$(echo ${git_desc} | cut -f 2 -d "-")
+
+                           sha=$(echo ${git_desc} | cut -f 3 -d "-")
+                           export GIT_SHA=${sha:1}
+
+                           # Also save the LLVM_REV until LNT server is taught about GIT
+                           export LLVM_REV=$(git show -q | grep "llvm-svn:" | cut -f2 -d":" | tr -d " ")
+
+                           cd -
+
+                           echo "GIT_DISTANCE=$GIT_DISTANCE" > build.properties
+                           echo "GIT_SHA=$GIT_SHA" >> build.properties
+                           echo "ARTIFACT=$JOB_NAME/clang-d$GIT_DISTANCE-g$GIT_SHA-t$BUILD_ID-b$BUILD_NUMBER.tar.gz" >> build.properties
+
+                           rm -rf clang-build clang-install *.tar.gz
+                           python llvm-zorg/zorg/jenkins/monorepo_build.py cmake build \
+                               --assertions --cmake-type=RelWithDebInfo \
+                               --projects="clang;clang-tools-extra;compiler-rt" \
+                               --cmake-flag="-DPython3_EXECUTABLE=$(which python)" \
+                               --cmake-flag="-DLLVM_TARGETS_TO_BUILD=AArch64"
+                        '''
+                    }
+                }
+            }
+        }
+        stage('Test') {
+            environment {
+                PATH="$PATH:/usr/bin:/usr/local/bin"
+            }
+            steps {
+                timeout(120) {
+                    sh '''
+                       set -u
+                       source ./venv/bin/activate
+                       python llvm-zorg/zorg/jenkins/monorepo_build.py cmake testlong
+                    '''
+                }
+            }
+            post {
+                always {
+                    script {
+                        junit "clang-build/**/testresults.xunit.xml"
+                    }
+                }
+            }
+        }
+    }
+    post {
+        always {
+            script {
+                // ToDo: Restore the issue scanner
+                // scanForIssues tool: clang()
+                sh "rm -rf clang-build clang-install host-compiler"
+            }
+        }
+        // This is commented out because we don't have downstream arm64 jobs setup yet, we will
+        // in the future
+        //success {
+        //    script {
+        //        if (!params.SKIP_TRIGGER) {
+        //            // Trigger Stage 2 Jobs
+        //            build job: 'clang-stage2-cmake-RgSan_relay-as', wait: false
+        //            build job: 'clang-stage2-Rthinlto_relay-as', wait: false
+        //            build job: 'relay-lnt-ctmark-as', wait: false
+        //            build job: 'relay-test-suite-verify-machineinstrs-as', wait: false
+        //        }
+        //    }
+        //}
+        //unstable {
+        //    script {
+        //        if (!params.SKIP_TRIGGER) {
+        //            // Trigger Stage 2 Jobs
+        //            build job: 'clang-stage2-cmake-RgSan_relay-as', wait: false
+        //            build job: 'clang-stage2-Rthinlto_relay-as', wait: false
+        //            build job: 'relay-lnt-ctmark-as', wait: false
+        //            build job: 'relay-test-suite-verify-machineinstrs-as', wait: false
+        //        }
+        //    }
+        //}
+    }
+}
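(For reference, the Build stage above derives GIT_DISTANCE and GIT_SHA by splitting the output of git describe --match "first_commit" on dashes. A minimal Python sketch of that parsing; the describe string below is illustrative:)

# Parse a `git describe --match first_commit` string such as
# "first_commit-12345-g0123abc" into the commit distance and the bare SHA.
def parse_git_describe(git_desc: str) -> tuple[str, str]:
    _, distance, g_sha = git_desc.split("-")
    return distance, g_sha[1:]  # drop the leading "g"

distance, sha = parse_git_describe("first_commit-12345-g0123abc")
# These feed the artifact name, e.g. clang-d12345-g0123abc-t<id>-b<n>.tar.gz
print(f"clang-d{distance}-g{sha}")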