Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

openqa-investigate: Provide support for multi-machine scenarios #170

Merged
merged 4 commits into from
Jul 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
82 changes: 72 additions & 10 deletions openqa-investigate
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,23 @@ curl_args=(--user-agent "openqa-investigate")
# Prints all given arguments (joined by spaces, like echo) on stderr.
echoerr() {
    echo "$@" 1>&2
}

clone() {
local origin id name_suffix refspec job_data unsupported_cluster_jobs name base_prio clone_settings casedir repo out url clone_id
local origin id name_suffix refspec job_data unsupported_cluster_jobs pending_cluster_jobs name base_prio clone_settings casedir repo out url clone_id
origin=${1:?"Need 'origin'"}
id=${2:?"Need 'id'"}
name_suffix=${3+":$3"}
refspec=${4+$4}
job_data=$(openqa-cli "${client_args[@]}" --json jobs/"$id")
# shellcheck disable=SC2181
[[ $? != 0 ]] && echoerr "unable to query job data for $id: $job_data" && return 1
unsupported_cluster_jobs=$(echo "$job_data" | runjq -r '(.job.children["Parallel"] | length) + (.job.parents["Parallel"] | length) + (.job.children["Directly chained"] | length) + (.job.parents["Directly chained"] | length)') || return $?

# fail on jobs with directly chained dependencies (not supported)
unsupported_cluster_jobs=$(echo "$job_data" | runjq -r '(.job.children["Directly chained"] | length) + (.job.parents["Directly chained"] | length)') || return $?
[[ $unsupported_cluster_jobs != 0 ]] \
&& echoerr "unable to clone job $id: it is part of a parallel or directly chained cluster (not supported)" && return 2
&& echoerr "Unable to clone job $id: it is part of a directly chained cluster (not supported)" && return 2

name="$(echo "$job_data" | runjq -r '.job.test'):investigate$name_suffix" || return $?
base_prio=$(echo "$job_data" | runjq -r '.job.priority') || return $?
clone_settings=("TEST=$name" '_GROUP_ID=0' 'BUILD=')
clone_settings=("TEST+=:investigate$name_suffix" '_GROUP_ID=0' 'BUILD=')
if [[ $refspec ]]; then
casedir=$(echo "$job_data" | runjq -r '.job.settings.CASEDIR') || return $?
[[ $casedir == null ]] && casedir=''
Expand Down Expand Up @@ -119,6 +122,57 @@ trigger_jobs() {
fi
}

# Queries the dependency data of the specified job and postpones the
# investigation if the job's cluster still contains pending jobs.
#
# Globals:   client_args (read) - extra arguments passed to openqa-cli
# Arguments: $1 - ID of the job to investigate
#            $2 - job data; still accepted so existing callers keep working,
#                 but it is not used
# Outputs:   the dependency data (JSON) on stdout if the investigation may
#            continue; diagnostics on stderr
# Returns:   255 if the job should not be skipped (the caller continues),
#            142 if the investigation is postponed,
#            1 if the dependency data could not be queried,
#            otherwise the non-zero status of the failed runjq call
query_dependency_data_or_postpone() {
    local id=$1 dependency_data pending_cluster_jobs

    # note: This "AJAX" route is normally used to render the dependencies tab in the web UI.
    # Check the query explicitly: without this a failed query yielding no
    # output would be misreported as "postponed" below.
    if ! dependency_data=$(openqa-cli "${client_args[@]}" --apibase '' --json tests/"$id"/dependencies_ajax); then
        echoerr "unable to query dependency data for $id: $dependency_data"
        return 1
    fi

    # postpone if not all dependencies are done/cancelled
    pending_cluster_jobs=$(echo "$dependency_data" | runjq -r '[.nodes[] | select(.state != "done" and .state != "cancelled")] | length') || return $?
    if [[ $pending_cluster_jobs != 0 ]]; then
        echoerr "Postponing to investigate job $id: waiting until pending dependencies have finished"
        return 142
    fi

    # do not skip the job
    echo "$dependency_data"
    return 255
}

# Synchronizes concurrent investigations of the same job cluster by writing
# an initial comment on the first job of the cluster and checking whether
# that comment is the first investigation comment on that job.
#
# Globals:   client_call (read) - command prefix used for API calls
# Arguments: $1 - ID of the job to investigate
#            $2 - ID of the job in the cluster used for commenting
# Outputs:   the ID of the created comment on stdout if we own the first
#            investigation comment; diagnostics on stderr
# Returns:   255 if the caller should continue with the investigation,
#            0 if the investigation is skipped (another comment came first),
#            1 if our own comment could not be deleted when skipping,
#            otherwise the non-zero status of a failed API/runjq pipeline
sync_via_investigation_comment() {
    local id=$1 first_cluster_job_id=$2
    # keep the IDs local so they do not leak into the caller's scope
    local comment_id first_comment_id
    local comments_route=jobs/$first_cluster_job_id/comments

    comment_id=$("${client_call[@]}" -X POST "$comments_route" text="Starting investigation for job $id" | runjq -r '.id') || return $?
    first_comment_id=$("${client_call[@]}" -X GET "$comments_route" | runjq -r '[.[] | select(.text | contains("investigation"))] | sort_by(.id) | first | .id') || return $?

    # delete comment again in case a concurrent job could start the investigation before us
    if [[ $comment_id != "$first_comment_id" ]]; then
        echoerr "Skipping investigation of job $id: job cluster is already being investigated, see comment on job $first_cluster_job_id"
        # never fall through to the "continue" path when the deletion fails:
        # we do not own the first comment, so investigating would duplicate work
        if ! "${client_call[@]}" -X DELETE "$comments_route/$comment_id"; then
            echoerr "Unable to delete comment $comment_id on job $first_cluster_job_id"
            return 1
        fi
        return 0
    fi

    echo "$comment_id"
    return 255
}

# Finalizes the comment that was initially written for synchronization.
#
# Globals:   client_call (read) - command prefix used for API calls
# Arguments: $1 - ID of the investigated job
#            $2 - ID of the job in the cluster carrying the sync comment
#            $3 - ID of the sync comment
#            $4 - text listing the investigation jobs (may be empty)
# Returns:   0 after deleting the comment when $4 is empty or when $1 and $2
#            are the same job; otherwise the status of the additional POST
finalize_investigation_comment() {
    local id=$1 first_cluster_job_id=$2 comment_id=$3 comment_text=$4
    local sync_comment_route=jobs/$first_cluster_job_id/comments/$comment_id

    # no investigation jobs were needed after all: remove the sync comment
    if [[ -z $comment_text ]]; then
        "${client_call[@]}" -X DELETE "$sync_comment_route"
        return 0
    fi

    # turn the sync comment into the actual investigation comment
    local comment="Automatic investigation jobs for job $id:"$'\n\n'"$comment_text"
    "${client_call[@]}" -X PUT "$sync_comment_route" text="$comment"

    # the job we're actually investigating gets the same comment unless it
    # is the job that already carries the sync comment
    [[ $first_cluster_job_id == "$id" ]] && return 0
    "${client_call[@]}" -X POST jobs/"$id"/comments text="$comment"
}

# crosscheck
# 1. current job/build + current test -> check if reproducible/sporadic
# 2. current job/build + last good test (+ last good needles) -> check for
Expand Down Expand Up @@ -151,6 +205,14 @@ investigate() {
echoerr "Job already has a clone, skipping investigation. Use the env variable 'force=true' to trigger investigation jobs"
return 0
fi

# determine dependency data or postpone if cluster not done
dependency_data=$(query_dependency_data_or_postpone "$id" "$job_data"); rc=$?
[[ $rc != 255 ]] && return $rc

# determine the job in the cluster with the lowest ID to use that for commenting/synchronization
first_cluster_job_id=$(echo "$dependency_data" | runjq -r "[$id, [.cluster[] | select(contains([$id]))]] | flatten | sort | first") || return $?

[[ "$old_name" =~ $exclude_name_regex ]] && echo "Job name '$old_name' matches \$exclude_name_regex '$exclude_name_regex', skipping investigation" && return 0
group="$(echo "$job_data" | runjq -r '.job.parent_group + " / " + .job.group')" || return $?
[[ "$group" = " / " ]] && [[ "$exclude_no_group" = "true" ]] && echo "Job w/o job group, \$exclude_no_group is set, skipping investigation" && return 0
Expand All @@ -160,13 +222,13 @@ investigate() {
# method instead for we are just working based on supplied job which can
# have more, ambiguous potential changes that we need to bisect on

# sync by writing initial investigation comment (edited later)
comment_id=$(sync_via_investigation_comment "$id" "$first_cluster_job_id"); rc=$?
[[ $rc != 255 ]] && return $rc

out=$(trigger_jobs "$id" "${@:2}")
$verbose && echo "$0, id: '$id', out: '$out'"
[[ $out ]] || return 0
comment="Automatic investigation jobs:

$out"
"${client_call[@]}" -X POST jobs/"$id"/comments text="$comment"
finalize_investigation_comment "$id" "$first_cluster_job_id" "$comment_id" "$out"
}

main() {
Expand All @@ -180,7 +242,7 @@ main() {
fi
fi
set -u
clone_call="${clone_call:-"$client_prefix openqa-clone-job --skip-chained-deps --within-instance"}"
clone_call="${clone_call:-"$client_prefix openqa-clone-job --skip-chained-deps --max-depth 0 --parental-inheritance --within-instance"}"
error_count=0
# shellcheck disable=SC2013
for i in $(cat - | sed 's/ .*$//'); do
Expand Down
78 changes: 74 additions & 4 deletions test/02-investigate.t
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ PATH=$BASHLIB$PATH

source bash+ :std
use Test::More
plan tests 10
plan tests 24

host=localhost
url=https://localhost
Expand All @@ -36,13 +36,38 @@ cli_rc=0
consider_parallel_and_directly_chained_clusters=1
out=$(clone 41 42 2>&1 > /dev/null) || rc=$?
is "$rc" 2 'fails when no jobs could be restarted'
is "$out" "unable to clone job 42: it is part of a parallel or directly chained cluster (not supported)" 'restart error on stderr'
is "$out" "Unable to clone job 42: it is part of a directly chained cluster (not supported)" 'restart error on stderr'

openqa-cli() {
if [[ "$1 $2" == "--json jobs/24" ]]; then
echo '{"job": { "test": "vim", "priority": 50, "settings" : {} } }'
elif [[ "$1 $2" == "--json jobs/27" ]]; then
echo '{"job": { "test": "vim", "clone_id" : 28 } }'
elif [[ $@ == "-X POST jobs/30/comments text=Starting investigation for job 31" ]]; then
echo '{"id": 1234}'
elif [[ $@ == $'-X PUT jobs/30/comments/1234 text=Automatic investigation jobs for job 31:\n\nfoo' ]]; then
touch comment_1234_updated
elif [[ $@ == '-X DELETE jobs/30/comments/1234' ]]; then
touch comment_1234_deleted
elif [[ $@ == $'-X POST jobs/31/comments text=Automatic investigation jobs for job 31:\n\nfoo' ]]; then
touch comment_for_job_31_created
elif [[ $@ == "-X GET jobs/30/comments" ]]; then
echo '[{"id": 1234, "text":"Starting investigation for 31"},{"id": 1235, "text":"unrelated comment"}]'
elif [[ $@ == "-X POST jobs/32/comments text=Starting investigation for job 32" ]]; then
echo '{"id": 1237}'
elif [[ $@ == "-X GET jobs/32/comments" ]]; then
echo '[{"id": 1236, "text":"Starting investigation for job 32"},{"id": 1237, "text":"Starting investigation for job 32"}]'
elif [[ $@ == '--apibase --json tests/27/dependencies_ajax' ]]; then
echo '{"cluster":{}, "edges":[], "nodes":[{"id":27,"state":"done","result":"passed"}]}'
elif [[ $@ == '--apibase --json tests/28/dependencies_ajax' ]]; then
echo '{"cluster":{}, "edges":[], "nodes":[{"id":28,"state":"done","result":"failed"}]}'
elif [[ $@ == '--apibase --json tests/29/dependencies_ajax' ]]; then
echo '{"cluster":{"cluster_foo":[28],"cluster_bar":[29]}, "edges":[], "nodes":[{"id":28,"state":"done","result":"failed"},{"id":29,"state":"done","result":"passed"}]}'
elif [[ $@ == '--apibase --json tests/30/dependencies_ajax' ]]; then
echo '{"cluster":{"cluster_foo":[28,30],"cluster_bar":[29]}, "edges":[], "nodes":[{"id":28,"state":"uploading","result":"none"},{"id":30,"state":"done","result":"passed"}]}'
elif [[ $@ == '--apibase --json tests/31/dependencies_ajax' ]]; then
# job with cancelled job in the cluster (should be treated like a done job)
echo '{"cluster":{"cluster_foo":[28,31],"cluster_bar":[29]}, "edges":[], "nodes":[{"id":28,"state":"cancelled","result":"none"},{"id":31,"state":"done","result":"failed"}]}'
else
echo '{"result": [{ "25": "foo", "26": "bar" }], "test_url": [{"25": "/tests/25", "26": "/tests/26"}] } '
fi
Expand All @@ -61,6 +86,51 @@ is "$rc" 0 'success regardless of actually triggered jobs'
is "$out" "Job already has a clone, skipping investigation. Use the env variable 'force=true' to trigger investigation jobs"

# forced investigation: the mocked job 27 has a clone (clone_id 28), so
# force=true is passed to investigate it anyway
rc=0
out=$(force=true investigate 27 2>&1) || rc=$?
is "$rc" 0 'still success'
# note: rc is not reset here, so a failure of the previous call would also
# show up in the next assertion
out=$(force=true investigate 28 2>&1) || rc=$?
is "$rc" 0 'still success when job is skipped (because of exclude_no_group)'
like "$out" "exclude_no_group is set, skipping investigation"

# the mocked dependency data of job 30 contains job 28 in state "uploading",
# so the investigation is expected to be postponed (exit code 142)
rc=0
out=$(investigate 30 2>&1) || rc=$?
is "$rc" 142 'investigation postponed because other job in cluster is not done'
like "$out" "Postponing to investigate job 30: waiting until pending dependencies have finished"

# the mocked cluster of job 31 only contains done/cancelled jobs, so the
# investigation is not postponed and gets as far as the job group check
rc=0
out=$(force=true investigate 31 2>&1) || rc=$?
is "$rc" 0 'success when job is skipped (because of exclude_no_group and job w/o group)'
like "$out" 'Job w/o job group, \$exclude_no_group is set, skipping investigation'

# test syncing via investigation comment; we're first
# (the mocked POST on job 30 returns comment ID 1234, which is also the only
# mocked comment on job 30 whose text contains "investigation")
rc=0
out=$(force=true sync_via_investigation_comment 31 30 2>&1) || rc=$?
is "$rc" 255 'do not skip if we own first investigation comment'
like "$out" '1234' 'comment ID returned'

# test syncing via investigation comment; we're second
# (the mocked POST on job 32 returns comment ID 1237 but the mocked comment
# list also contains the older investigation comment 1236)
rc=0
out=$(force=true sync_via_investigation_comment 32 32 2>&1) || rc=$?
is "$rc" 0 'skip with success if we do not own first investigation comment'
# NOTE(review): an empty `like` pattern presumably matches any string, and
# stderr is merged into $out via 2>&1, so this check likely cannot fail and
# does not prove that there is no output - confirm and tighten if intended
like "$out" '' 'no output when skipping'

# delete certain files used to trace whether API calls happened
# (the openqa-cli mock creates these files via touch when the corresponding
# comment API call is made)
for trace_file in comment_1234_updated comment_for_job_31_created comment_1234_deleted; do
[[ -f $trace_file ]] && unlink "$trace_file"
done

# test finalizing investigation comment when no investigation jobs were needed
# (the empty fourth argument is the comment text)
rc=0
out=$(force=true finalize_investigation_comment 31 30 1234 '' 2>&1) || rc=$?
is "$rc" 0 'success if no investigation jobs needed to be created after all'
# translate the presence of the trace file into 1/0 so it can be asserted via `is`
[[ -f comment_1234_deleted ]] && comment_1234_deleted=1 || comment_1234_deleted=0
okurz marked this conversation as resolved.
Show resolved Hide resolved
# translate the presence of the trace file into 1/0 so it can be asserted via `is`
[[ -f comment_for_job_31_created ]] && comment_for_job_31_created=1 || comment_for_job_31_created=0
is "$comment_1234_deleted" 1 'comment on job 30 deleted'
is "$comment_for_job_31_created" 0 'no comment on job 31 created'

# test finalizing investigation comment when investigation jobs had been created
# ('foo' is passed as the comment text; the mock expects it in the PUT on
# job 30 and in the POST on job 31)
rc=0
out=$(force=true finalize_investigation_comment 31 30 1234 'foo' 2>&1) || rc=$?
is "$rc" 0 'success if we write an investigation comment'
# translate the presence of the trace files into 1/0 for the assertions below
[[ -f comment_1234_updated ]] && comment_1234_updated=1 || comment_1234_updated=0
[[ -f comment_for_job_31_created ]] && comment_for_job_31_created=1 || comment_for_job_31_created=0
is "$comment_1234_updated" 1 'comment on job 30 updated'
is "$comment_for_job_31_created" 1 'comment on job 31 created as well'