Skip to content

Commit

Permalink
Initial front end caching
Browse files Browse the repository at this point in the history
There are times where neither the kernel revision nor the compiler has
changed since the last run, which means there is little point running the
build because it is unlikely anything would change (except due to
flakiness with GitHub Actions, more on that later). While tuxsuite does
have compiler caching enabled, it still relies on spinning up all the
build machines, letting the cache work, then firing off the boot tests,
which can be expensive.

By caching the compiler string and the kernel revision, it is possible
to avoid even spinning up the tuxsuite jobs if nothing has changed.
should_run.py takes care of this by exiting:

    * 0 if the build should run (something changed or there were no
      previous results)
    * 1 for any internal assertion failure
    * 2 if nothing changed from the previous run

If there is any internal assertion failure, the cache step fails,
failing the whole workflow, so that those situations can be properly
dealt with. The script should never do this, but it is possible there
are things I have not considered happening yet.

To avoid contention with pushing and pulling, each should_run.py job
gets its own branch. While this will result in a lot of branches, they
should not cause too many issues because they will just contain the JSON
file with the last run info.

should_run.py does not try to account for flakiness on the GitHub
Actions or TuxSuite side, so it is possible that a previous build will
fail due to flakiness and not be retried on the next cron if nothing
changes since then. To attempt to account for a situation where we
re-run a known flaky build, the script gets out of the way when the
GitHub Actions event is "workflow_dispatch", meaning a workflow was
manually run. Additionally, if the previous run was only flaky during
the QEMU boots (rather than during the TuxSuite stage), they can
generally just be re-run right away, since the kernels do not need to be
rebuilt. I do not think this will happen too often but if it does, we
can try to come up with a better heuristic.

Closes: ClangBuiltLinux#308
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
  • Loading branch information
nathanchance committed Feb 26, 2023
1 parent 4497803 commit 360a478
Show file tree
Hide file tree
Showing 2 changed files with 201 additions and 2 deletions.
62 changes: 60 additions & 2 deletions generate_workflow.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#!/usr/bin/env python3

import argparse
import contextlib
import hashlib
from pathlib import Path
import sys
import yaml

Expand Down Expand Up @@ -85,11 +87,55 @@ def sanitize_job_name(name):


def tuxsuite_setups(job_name, tuxsuite_yml, repo, ref):
patch_series = patch_series_flag(
tuxsuite_yml.split("/")[1].split("-clang-")[0])
tuxsuite_yml_name = Path(tuxsuite_yml).name
# Input: '<tree>-clang-<num>.tux.yml'
# Output: [<tree_parts>, 'clang', <num>]
workflow_parts = tuxsuite_yml_name.replace('.tux.yml', '').split('-')

tree = '-'.join(workflow_parts[0:-2])
patch_series = patch_series_flag(tree)

ci_folder = Path(__file__).resolve().parent
with Path(ci_folder, 'LLVM_TOT_VERSION').open(encoding='utf-8') as file:
max_version = int(file.read())
llvm_version = workflow_parts[-1]
# If llvm_version is 'android', converting it to an integer will fail. We
# do not care, just suppress the error and move on.
with contextlib.suppress(ValueError):
if int(llvm_version) == max_version:
llvm_version = 'nightly'
return {
f"cache_check_{job_name}": {
"name": f"cache check ({job_name})",
"runs-on": "ubuntu-latest",
"container": f"tuxmake/clang-{llvm_version}",
"outputs": {
"should_run": "${{ steps.should_run.outputs.should_run }}",
},
"permissions": "write-all",
"steps": [
{
"uses": "actions/checkout@v3",
},
{
"name": "Should build run?",
"id": "should_run",
"run": ('if python3 should_run.py || { ret=$?; ( exit $ret ) }; then\n'
' echo "should_run=true" >>$GITHUB_OUTPUT\n'
'else\n'
' case $ret in\n'
' 2) echo "should_run=false" >>$GITHUB_OUTPUT ;;\n'
' *) exit 1 ;;\n'
' esac\n'
'fi\n'),
"env": {"GITHUB_TOKEN": '${{ secrets.GITHUB_TOKEN }}'},
},
],
},
f"kick_tuxsuite_{job_name}": {
"name": f"TuxSuite ({job_name})",
"needs": f"cache_check_{job_name}",
"if": f"${{{{ needs.cache_check_{job_name}.outputs.should_run == 'true' }}}}",
# https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-syntax-for-github-actions#jobsjob_idruns-on
"runs-on": "ubuntu-latest",
"container": "tuxsuite/tuxsuite",
Expand Down Expand Up @@ -222,6 +268,18 @@ def print_builds(config, tree_name, llvm_version):
sys.stdout = orig_stdout


# https://github.com/yaml/pyyaml/issues/240
# https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
def str_presenter(dumper, data):
    """Represent a string scalar, using literal block style for multi-line text.

    Args:
        dumper: Object providing ``represent_scalar(tag, value, style=...)``.
        data: The string being serialized.

    Returns:
        Whatever ``dumper.represent_scalar`` returns for the string.
    """
    str_tag = 'tag:yaml.org,2002:str'
    # A newline anywhere in the value means it reads best as a '|' block.
    if '\n' in data:
        return dumper.represent_scalar(str_tag, data, style='|')
    return dumper.represent_scalar(str_tag, data)


yaml.add_representer(str, str_presenter)

if __name__ == "__main__":
generated_config = get_config_from_generator()
args = parse_args(generated_config["trees"])
Expand Down
141 changes: 141 additions & 0 deletions should_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/env python3

import json
import os
from pathlib import Path
import subprocess
import sys

import yaml

import utils

# Decide whether the TuxSuite build for the current workflow job needs to run
# by comparing the current compiler string and remote kernel sha against the
# values recorded by the previous run on a per-job branch.
#
# Exit codes:
#   0: build should run (manual dispatch, no previous info, or info changed)
#   1: an unhandled exception (RuntimeError, failed subprocess, ...)
#   2: nothing changed since the previous run, build can be skipped
if 'GITHUB_ACTIONS' not in os.environ:
    raise RuntimeError('Not running on GitHub Actions?')

if 'GITHUB_TOKEN' not in os.environ:
    raise RuntimeError('No GITHUB_TOKEN was specified?')

github = {
    'actor': os.environ['GITHUB_ACTOR'],
    'event_name': os.environ['GITHUB_EVENT_NAME'],
    'job': os.environ['GITHUB_JOB'],
    'token': os.environ['GITHUB_TOKEN'],
    'repository': os.environ['GITHUB_REPOSITORY'],
    'repository_owner': os.environ['GITHUB_REPOSITORY_OWNER'],
    'workflow_ref': os.environ['GITHUB_WORKFLOW_REF'],
    'workspace': Path(os.environ['GITHUB_WORKSPACE']),
}

if github['repository_owner'] != 'ClangBuiltLinux':
    raise RuntimeError('Not running in ClangBuiltLinux repo, exiting...')

# A workflow dispatch means we want the workflow to run because it is being
# called manually, do not bother checking anything further in this case.
if github['event_name'] == 'workflow_dispatch':
    print(
        'Workflow was called manually, exiting with return code 0 to run tuxsuite...',
    )
    sys.exit(0)

# Input: <owner>/<repo>/.github/workflows/<workflow>.yml@refs/heads/<branch>
# Output: <owner>/<repo>/.github/workflows/<workflow>.yml
workflow_path = Path(github['workflow_ref'].split('@', 1)[0])
workflow_stem = workflow_path.stem
# branch name is <workflow>-<job>, so that it is entirely unique for updating
branch = f"{workflow_stem}-{github['job']}"

# Configure git
repo = Path(github['workspace'], github['workspace'].name)
git_configs = {
    'safe.directory': repo,
    'user.name': f"{github['actor']} via GitHub Actions",
    'user.email': f"{github['actor']}@users.noreply.github.com",
}
for key, val in git_configs.items():
    subprocess.run(['git', 'config', '--global', key, val], check=True)

# Clone repository
subprocess.run(
    ['git', 'clone', f"https://github.com/{github['repository']}", repo],
    check=True)

# Done out of band to avoid leaking GITHUB_TOKEN, the push will fail later
# if this does not work, so check=False.
new_remote = f"https://{github['actor']}:{github['token']}@github.com/{github['repository']}"
subprocess.run(['git', 'remote', 'set-url', 'origin', new_remote],
               check=False,
               cwd=repo)

# If there is no branch in the repository for the current workflow, create one
try:
    subprocess.run(['git', 'checkout', branch], check=True, cwd=repo)
except subprocess.CalledProcessError:
    # Start from an empty orphan branch so it only ever holds the info file.
    subprocess.run(['git', 'checkout', '--orphan', branch],
                   check=True,
                   cwd=repo)
    subprocess.run(['git', 'rm', '-fr', '.'], check=True, cwd=repo)

# Get compiler string (first line of 'clang --version')
compiler = subprocess.run(['clang', '--version'],
                          capture_output=True,
                          check=True,
                          text=True).stdout.splitlines()[0]

# Get current sha of remote
# Input: <tree>-clang-<num>
# Output: <tree>
# Have to split then join because tree could have a hyphen
# pylint: disable-next=invalid-name
tree_name = '-'.join(workflow_stem.split('-')[0:-2])
with Path(github['workspace'], 'generator.yml').open(encoding='utf-8') as file:
    config = yaml.safe_load(file)
url, ref = utils.get_repo_ref(config, tree_name)
ls_rem = subprocess.run(['git', 'ls-remote', url, ref],
                        capture_output=True,
                        check=True,
                        text=True)
# Input: <sha>\tref/heads/<ref>
# Output: <sha>
sha = ls_rem.stdout.split('\t', 1)[0]
# 'git ls-remote' succeeds with empty output when nothing matches; caching an
# empty sha would make later runs report "no changes", so fail loudly instead.
if not sha:
    raise RuntimeError(f"Could not find sha for {ref} in {url}?")

info_json = Path(repo, 'last_run_info.json')
new_run_info = {
    'compiler': compiler,
    'sha': sha,
}

# Load the previous run's info if it exists...
old_run_info = None
if info_json.exists():
    with info_json.open(encoding='utf-8') as file:
        old_run_info = json.load(file)

# ...and (re)write the file whenever it is missing or differs in any way.
# Comparing the whole mapping avoids a KeyError on keys that only exist in the
# old file and catches keys that only exist in the new info.
if old_run_info != new_run_info:
    with info_json.open('w', encoding='utf-8') as file:
        json.dump(new_run_info, file, indent=4, sort_keys=True)

# Let git decide whether anything actually changed on disk.
subprocess.run(['git', 'add', info_json.name], check=True, cwd=repo)
status = subprocess.run(['git', 'status', '--porcelain', '-u'],
                        capture_output=True,
                        check=True,
                        cwd=repo,
                        text=True)
if not status.stdout:  # No changes, we do not need to run
    print(
        f"I: No changes to {info_json.name} detected, exiting with return code 2 to skip running tuxsuite...",
    )
    sys.exit(2)

subprocess.run(['git', 'commit', '-m', f"{branch}: Update last_run_info.json"],
               check=True,
               cwd=repo)
subprocess.run(['git', 'push', 'origin', f"HEAD:{branch}"],
               check=True,
               cwd=repo)
print('I: Exiting with return code 0 to run tuxsuite...')

0 comments on commit 360a478

Please sign in to comment.