Initial front end caching

There are times when neither the kernel revision nor the compiler has changed since the last run, which means there is little point in running the build because it is unlikely anything would change (except due to flakiness with GitHub Actions, more on that later). While tuxsuite does have compiler caching enabled, it still relies on spinning up all the build machines, letting the cache work, then firing off the boot tests, which can be expensive.

By caching the compiler string and the kernel revision, it is possible to avoid even spinning up the tuxsuite jobs if nothing has changed. should_run.py takes care of this by exiting:

* 0 if the build should run (something changed or there were no previous results)
* 1 for any internal assertion failure
* 2 if nothing changed from the previous run

If there is any internal assertion failure, the cache step fails, failing the whole workflow, so that those situations can be properly dealt with. The script should never do this, but it is possible there are things I have not considered happening yet.

To avoid contention with pushing and pulling, each should_run.py job gets its own branch. While this will result in a lot of branches, they should not cause too many issues because they will just contain the JSON file with the last run info.

should_run.py does not try to account for flakiness on the GitHub Actions or TuxSuite side, so it is possible that a previous build will fail due to flakiness and not be retried on the next cron if nothing changes since then. To attempt to account for a situation where we re-run a known flaky build, the script gets out of the way when the GitHub Actions event is "workflow_dispatch", meaning a workflow was manually run. Additionally, if the previous run was only flaky during the QEMU boots (rather than during the TuxSuite stage), they can generally just be re-run right away, since the kernels do not need to be rebuilt.
I do not think this will happen too often, but if it does, we can try to come up with a better heuristic.

Closes: ClangBuiltLinux#308
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
nathanchance · Feb 26, 2023 · 360a478 · 360a478
1 parent 4497803
commit 360a478
Show file tree

Hide file tree

Showing 2 changed files with 201 additions and 2 deletions.
diff --git a/generate_workflow.py b/generate_workflow.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
 
 import argparse
+import contextlib
 import hashlib
+from pathlib import Path
 import sys
 import yaml
 
@@ -85,11 +87,55 @@ def sanitize_job_name(name):
 
 
 def tuxsuite_setups(job_name, tuxsuite_yml, repo, ref):
-    patch_series = patch_series_flag(
-        tuxsuite_yml.split("/")[1].split("-clang-")[0])
+    tuxsuite_yml_name = Path(tuxsuite_yml).name
+    # Input: '<tree>-clang-<num>.tux.yml'
+    # Output: [<tree_parts>, 'clang', <num>]
+    workflow_parts = tuxsuite_yml_name.replace('.tux.yml', '').split('-')
+
+    tree = '-'.join(workflow_parts[0:-2])
+    patch_series = patch_series_flag(tree)
+
+    ci_folder = Path(__file__).resolve().parent
+    with Path(ci_folder, 'LLVM_TOT_VERSION').open(encoding='utf-8') as file:
+        max_version = int(file.read())
+    llvm_version = workflow_parts[-1]
+    # if llvm_version is 'android', converting it to an integer will fail. We
+    # do not care, just suppress the error and move on.
+    with contextlib.suppress(ValueError):
+        if int(llvm_version) == max_version:
+            llvm_version = 'nightly'
     return {
+        f"cache_check_{job_name}": {
+            "name": f"cache check ({job_name})",
+            "runs-on": "ubuntu-latest",
+            "container": f"tuxmake/clang-{llvm_version}",
+            "outputs": {
+                "should_run": "${{ steps.should_run.outputs.should_run }}",
+            },
+            "permissions": "write-all",
+            "steps": [
+                {
+                    "uses": "actions/checkout@v3",
+                },
+                {
+                    "name": "Should build run?",
+                    "id": "should_run",
+                    "run": ('if python3 should_run.py || { ret=$?; ( exit $ret ) }; then\n'
+                            '  echo "should_run=true" >>$GITHUB_OUTPUT\n'
+                            'else\n'
+                            '    case $ret in\n'
+                            '      2) echo "should_run=false" >>$GITHUB_OUTPUT ;;\n'
+                            '      *) exit 1 ;;\n'
+                            '    esac\n'
+                            'fi\n'),
+                    "env": {"GITHUB_TOKEN": '${{ secrets.GITHUB_TOKEN }}'},
+                },
+            ],
+        },
         f"kick_tuxsuite_{job_name}": {
             "name": f"TuxSuite ({job_name})",
+            "needs": f"cache_check_{job_name}",
+            "if": f"${{{{ needs.cache_check_{job_name}.outputs.should_run == 'true' }}}}",
             # https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-syntax-for-github-actions#jobsjob_idruns-on
             "runs-on": "ubuntu-latest",
             "container": "tuxsuite/tuxsuite",
@@ -222,6 +268,18 @@ def print_builds(config, tree_name, llvm_version):
         sys.stdout = orig_stdout
 
 
+# https://github.com/yaml/pyyaml/issues/240
+# https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
+def str_presenter(dumper, data):
+    if data.count('\n') > 0:  # check for multiline string
+        return dumper.represent_scalar('tag:yaml.org,2002:str',
+                                       data,
+                                       style='|')
+    return dumper.represent_scalar('tag:yaml.org,2002:str', data)
+
+
+yaml.add_representer(str, str_presenter)
+
 if __name__ == "__main__":
     generated_config = get_config_from_generator()
     args = parse_args(generated_config["trees"])

diff --git a/should_run.py b/should_run.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+
+import json
+import os
+from pathlib import Path
+import subprocess
+import sys
+
+import yaml
+
+import utils
+
+if 'GITHUB_ACTIONS' not in os.environ:
+    raise RuntimeError('Not running on GitHub Actions?')
+
+if 'GITHUB_TOKEN' not in os.environ:
+    raise RuntimeError('No GITHUB_TOKEN was specified?')
+
+github = {
+    'actor': os.environ['GITHUB_ACTOR'],
+    'event_name': os.environ['GITHUB_EVENT_NAME'],
+    'job': os.environ['GITHUB_JOB'],
+    'token': os.environ['GITHUB_TOKEN'],
+    'repository': os.environ['GITHUB_REPOSITORY'],
+    'repository_owner': os.environ['GITHUB_REPOSITORY_OWNER'],
+    'workflow_ref': os.environ['GITHUB_WORKFLOW_REF'],
+    'workspace': Path(os.environ['GITHUB_WORKSPACE']),
+}
+
+if github['repository_owner'] != 'ClangBuiltLinux':
+    raise RuntimeError('Not running in ClangBuiltLinux repo, exiting...')
+# A workflow dispatch means we want the workflow to run because it is being
+# called manually, do not bother checking anything further in this case.
+if github['event_name'] == 'workflow_dispatch':
+    print(
+        'Workflow was called manually, exiting with return code 0 to run tuxsuite...',
+    )
+    sys.exit(0)
+
+# Input: <owner>/<repo>/.github/workflows/<workflow>.yml@refs/heads/<branch>
+# Output: <owner>/<repo>/.github/workflows/<workflow>.yml
+workflow_path = Path(github['workflow_ref'].split('@', 1)[0])
+workflow_stem = workflow_path.stem
+# branch name is <workflow>-<job>, so that it is entirely unique for updating
+branch = f"{workflow_stem}-{github['job']}"
+
+# Configure git
+repo = Path(github['workspace'], github['workspace'].name)
+git_configs = {
+    'safe.directory': repo,
+    'user.name': f"{github['actor']} via GitHub Actions",
+    'user.email': f"{github['actor']}@users.noreply.github.com",
+}
+for key, val in git_configs.items():
+    subprocess.run(['git', 'config', '--global', key, val], check=True)
+
+# Clone repository
+subprocess.run(
+    ['git', 'clone', f"https://github.com/{github['repository']}", repo],
+    check=True)
+
+# Done out of band to avoid leaking GITHUB_TOKEN, the push will fail later
+# if this does not work, so check=False.
+new_remote = f"https://{github['actor']}:{github['token']}@github.com/{github['repository']}"
+subprocess.run(['git', 'remote', 'set-url', 'origin', new_remote],
+               check=False,
+               cwd=repo)
+
+# If there is no branch in the repository for the current workflow, create one
+try:
+    subprocess.run(['git', 'checkout', branch], check=True, cwd=repo)
+except subprocess.CalledProcessError:
+    subprocess.run(['git', 'checkout', '--orphan', branch],
+                   check=True,
+                   cwd=repo)
+    subprocess.run(['git', 'rm', '-fr', '.'], check=True, cwd=repo)
+
+# Get compiler string
+compiler = subprocess.run(['clang', '--version'],
+                          capture_output=True,
+                          check=True,
+                          text=True).stdout.splitlines()[0]
+
+# Get current sha of remote
+# Input: <tree>-clang-<num>
+# Output: <tree>
+# Have to split then join because tree could have a hyphen
+# pylint: disable-next=invalid-name ??
+tree_name = '-'.join(workflow_stem.split('-')[0:-2])
+with Path(github['workspace'], 'generator.yml').open(encoding='utf-8') as file:
+    config = yaml.safe_load(file)
+url, ref = utils.get_repo_ref(config, tree_name)
+ls_rem = subprocess.run(['git', 'ls-remote', url, ref],
+                        capture_output=True,
+                        check=True,
+                        text=True)
+# Input: <sha>\trefs/heads/<ref>
+# Output: <sha>
+sha = ls_rem.stdout.split('\t', 1)[0]
+
+info_json = Path(repo, 'last_run_info.json')
+new_run_info = {
+    'compiler': compiler,
+    'sha': sha,
+}
+
+# If the file already exists...
+if info_json.exists():
+    with info_json.open(encoding='utf-8') as file:
+        old_run_info = json.load(file)
+    # compare the two, writing to disk and breaking as soon as there is a
+    # difference
+    for key in old_run_info:
+        if old_run_info[key] != new_run_info[key]:
+            with info_json.open('w', encoding='utf-8') as file:
+                json.dump(new_run_info, file, indent=4, sort_keys=True)
+            break
+else:
+    # Otherwise, create and write to the file
+    with info_json.open('w', encoding='utf-8') as file:
+        json.dump(new_run_info, file, indent=4, sort_keys=True)
+
+subprocess.run(['git', 'add', info_json.name], check=True, cwd=repo)
+status = subprocess.run(['git', 'status', '--porcelain', '-u'],
+                        capture_output=True,
+                        check=True,
+                        cwd=repo,
+                        text=True)
+if not status.stdout:  # No changes, we do not need to run
+    print(
+        f"I: No changes to {info_json.name} detected, exiting with return code 2 to skip running tuxsuite...",
+    )
+    sys.exit(2)
+
+subprocess.run(['git', 'commit', '-m', f"{branch}: Update last_run_info.json"],
+               check=True,
+               cwd=repo)
+subprocess.run(['git', 'push', 'origin', f"HEAD:{branch}"],
+               check=True,
+               cwd=repo)
+print('I: Exiting with return code 0 to run tuxsuite...')