Skip to content

Commit

Permalink
Approve jobs if at least older jobs passed
Browse files Browse the repository at this point in the history
- if aggregate update failed, do not give up immediately
- look at previous openQA jobs: if one is present, green, not too old,
  still present in the qem-dashboard (to avoid using tests from a
  different release request) and includes the update under test,
  ignore that failure

This is to reduce the impact of one test being broken one day and a
different test another day, leaving the update unapproved even though
the combined results are all green, just not at the same time.
  • Loading branch information
Michael Grifalconi committed Mar 20, 2024
1 parent 7b921a0 commit 813f225
Show file tree
Hide file tree
Showing 4 changed files with 239 additions and 4 deletions.
1 change: 1 addition & 0 deletions openqabot/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
OPENQA_URL = "openqa.suse.de"
# Parent group id that a job group's "parent_id" is compared against to
# decide whether it is a development group.
DEVELOPMENT_PARENT_GROUP_ID = 9
DOWNLOAD_BASE = "http://download.suse.de/ibs/SUSE:/Maintenance:/"
# How many days (by build date) before the failed job's build an older
# passing job may be and still be used to ignore an aggregate failure.
OLDEST_APPROVAL_JOB_DAYS = 6
99 changes: 96 additions & 3 deletions openqabot/approver.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from logging import getLogger
from typing import List
from urllib.error import HTTPError
from datetime import timedelta, datetime
import re

import osc.conf
Expand All @@ -15,7 +16,7 @@
from openqabot.openqa import openQAInterface
from openqabot.dashboard import get_json

from . import OBS_GROUP, OBS_MAINT_PRJ, OBS_URL, QEM_DASHBOARD
from . import OBS_GROUP, OBS_MAINT_PRJ, OBS_URL, QEM_DASHBOARD, OLDEST_APPROVAL_JOB_DAYS
from .loader.qem import (
IncReq,
JobAggr,
Expand Down Expand Up @@ -131,11 +132,96 @@ def is_job_marked_acceptable_for_incident(self, job_id: int, inc: int) -> bool:
pass
return False

def job_acceptable(self, inc: int, res) -> bool:
@lru_cache(maxsize=512)
def validate_job_qam(self, job: int) -> bool:
    """Check that the dashboard still records *job* as passed.

    A valid test result must still be present in the dashboard (see
    https://github.com/openSUSE/qem-dashboard/pull/78/files) to avoid
    using results related to an old release request.

    Returns True only when the dashboard answers with a record whose
    "status" is "passed"; an empty answer, an "error" answer, or any
    other (or missing) status yields False.
    """
    qam_data = get_json("api/jobs/" + str(job), headers=self.token)
    if not qam_data:
        return False
    if "error" in qam_data:
        log.info(
            "Cannot find job %s in the dashboard database to make sure it is valid",
            job,
        )
        return False
    # .get(): a record without a "status" key is treated as "not passed"
    # instead of aborting the whole run with a KeyError.
    if qam_data.get("status") != "passed":
        log.info(
            'Job %s is not recorded as "passed" in the qam-dashboard database',
            job,
        )
        return False
    return True

@lru_cache(maxsize=512)
def was_ok_before(self, failed_job_id: int, inc: int) -> bool:
    """Tell whether a failed aggregate job can be ignored thanks to an older run.

    Walks the jobs that preceded *failed_job_id* in openQA, newest first,
    and returns True for the first one that passed or softfailed, provided
    it is not older than OLDEST_APPROVAL_JOB_DAYS (by build date), its
    job data mentions the repository of update *inc*, and the dashboard
    still records it as passed.

    Returns False when there are no older jobs, when a job that is too
    old is reached, when a passing job lacks the update or fails the
    dashboard check, or when no older job succeeded.

    Raises ValueError if a build name does not start with a YYYYMMDD date.
    """
    # We need a considerable amount of older jobs, since there could be many
    # failed manual restarts from same day
    older_jobs = self.client.get_older_jobs(failed_job_id, 20)
    # The client returns [] on request errors; also cope with a payload that
    # has no (or an empty) "data" list instead of crashing on the index.
    jobs = older_jobs.get("data") if isinstance(older_jobs, dict) else None
    if not jobs:
        log.info("Cannot find older jobs for %s", failed_job_id)
        return False

    def build_date(job) -> datetime:
        # Keep only the date part in front of the first "-" so that the
        # width of the trailing counter ("-1", "-12", ...) does not matter.
        return datetime.strptime(job["build"].partition("-")[0], "%Y%m%d")

    # Use at most X days old build. Don't go back in time too much to
    # reduce risk of using invalid tests
    oldest_build_usable = build_date(jobs[0]) - timedelta(
        days=OLDEST_APPROVAL_JOB_DAYS
    )

    regex = re.compile(r"(.*)Maintenance:/%s/(.*)" % inc)
    # Skipping first job, which was the current one
    for job in jobs[1:]:
        # Check the job is not too old
        if build_date(job) < oldest_build_usable:
            log.info(
                "Cannot ignore aggregate failure %s for update %s because: Older jobs are too old to be considered",
                failed_job_id,
                inc,
            )
            return False

        if job["result"] not in ("passed", "softfailed"):
            continue

        # Check the job contains the update under test
        job_settings = self.client.get_single_job(job["id"])
        if not regex.match(str(job_settings)):
            # Likely older jobs don't have it either. Giving up
            log.info(
                "Cannot ignore aggregate failure %s for update %s because: Older passing jobs do not have update under test",
                failed_job_id,
                inc,
            )
            return False

        if not self.validate_job_qam(job["id"]):
            log.info(
                "Cannot ignore failed aggregate %s using %s for update %s because is not present in qem-dashboard. It's likely about an older release request",
                failed_job_id,
                job["id"],
                inc,
            )
            return False

        log.info(
            "Ignoring failed aggregate %s and using instead %s for update %s",
            failed_job_id,
            job["id"],
            inc,
        )
        return True

    log.info(
        "Cannot ignore aggregate failure %s for update %s because: Older usable jobs did not succeed. Run out of jobs to evaluate.",
        failed_job_id,
        inc,
    )
    return False

def job_acceptable(self, inc: int, api: str, res) -> bool:
"""
Check each job if it is acceptable for different reasons.
Keep jobs marked as acceptable for one incident by openQA comments.
Keep jobs marked as acceptable if are aggregate and were ok in the previous days.
"""
if res["status"] == "passed":
return True
Expand All @@ -145,6 +231,13 @@ def job_acceptable(self, inc: int, res) -> bool:
"Ignoring failed job %s for incident %s due to openQA comment", url, inc
)
return True
if api == "api/jobs/update/" and self.was_ok_before(res["job_id"], inc):
log.info(
"Ignoring failed aggregate job %s for incident %s due to older eligible openQA job being ok",
url,
inc,
)
return True
log.info("Found failed, not-ignored job %s for incident %s", url, inc)
return False

Expand All @@ -156,7 +249,7 @@ def get_jobs(self, job_aggr: JobAggr, api: str, inc: int) -> bool:
"Job setting %s not found for incident %s"
% (str(job_aggr.id), str(inc))
)
return all(self.job_acceptable(inc, r) for r in results)
return all(self.job_acceptable(inc, api, r) for r in results)

def get_incident_result(self, jobs: List[JobAggr], api: str, inc: int) -> bool:
res = False
Expand Down
25 changes: 25 additions & 0 deletions openqabot/openqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,28 @@ def is_devel_group(self, groupid: int) -> bool:
return (
ret[0]["parent_id"] == DEVELOPMENT_PARENT_GROUP_ID if ret else True
) # ID of Development Group

@lru_cache(maxsize=256)
def get_single_job(self, job_id: int):
    """Fetch a single openQA job by its id.

    Returns the "job" entry of the openQA answer, or None when the
    request raises RequestError (the error is logged).
    """
    try:
        response = self.openqa.openqa_request("GET", "jobs/%s" % job_id)
        return response["job"]
    except RequestError as err:
        log.exception(err)
        return None

@lru_cache(maxsize=256)
def get_older_jobs(self, job_id: int, limit: int):
    """Ask openQA for up to *limit* jobs that preceded *job_id*.

    Returns the answer of the "/tests/<id>/ajax" route unchanged, or an
    empty list when the request raises RequestError (the error is logged).
    """
    route = "/tests/%s/ajax?previous_limit=%s&next_limit=0" % (job_id, limit)
    try:
        return self.openqa.openqa_request("GET", route, retries=self.retries)
    except RequestError as err:
        log.exception(err)
        return []
118 changes: 117 additions & 1 deletion tests/test_approve.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,57 @@
openqa_instance_url = urlparse("http://instance.qa")


@pytest.fixture(scope="function")
def fake_responses_for_unblocking_incidents_via_older_ok_result(request):
    """Register the mocked HTTP answers for the "older job was ok" scenario.

    The dashboard reports job 100002 as failed for update 20005; openQA
    lists two older jobs for it (one failed, one softfailed), and every
    openQA job lookup returns settings whose repositories include the
    incident number given as the fixture parameter.
    """
    dashboard_results = [
        {"job_id": 100000, "status": "passed"},
        {"job_id": 100002, "status": "failed"},
        {"job_id": 100003, "status": "passed"},
    ]
    responses.add(
        responses.GET,
        f"{QEM_DASHBOARD}api/jobs/update/20005",
        json=dashboard_results,
    )
    add_two_passed_response()

    # The only comment on the failed job targets a different incident (555).
    responses.add(
        responses.GET,
        url="http://instance.qa/api/v1/jobs/100002/comments",
        json=[{"text": "@review:acceptable_for:incident_555:foo"}],
    )

    # Newest first: the failed job itself, then its predecessors.
    job_history = [
        {"build": "20240115-1", "id": 100002, "result": "failed"},
        {"build": "20240114-1", "id": 100004, "result": "failed"},
        {"build": "20240113-1", "id": 100005, "result": "softfailed"},
    ]
    responses.add(
        responses.GET,
        re.compile(r"http://instance.qa/tests/.*/ajax\?previous_limit=.*&next_limit=0"),
        json={"data": job_history},
    )

    base_test_repos = (
        "http://download.suse.de/ibs/SUSE:/Maintenance:/1111/SUSE_Updates_SLE-Module-Basesystem_15-SP5_x86_64/,http://download.suse.de/ibs/SUSE:/Maintenance:/%s/SUSE_Updates_SLE-Module-Basesystem_15-SP5_x86_64/"
        % request.param
    )
    responses.add(
        responses.GET,
        re.compile(f"http://instance.qa/api/v1/jobs/.*"),
        json={"job": {"settings": {"BASE_TEST_REPOS": base_test_repos}}},
    )


@pytest.fixture(scope="function")
def fake_openqa_older_jobs_api():
    """Make every openQA "older jobs" (ajax) lookup answer with HTTP 404."""
    older_jobs_url = re.compile(
        r"http://instance.qa/tests/.*/ajax\?previous_limit=.*&next_limit=0"
    )
    responses.add(responses.GET, older_jobs_url, status=404)


def add_two_passed_response():
responses.add(
responses.GET,
Expand Down Expand Up @@ -404,7 +455,11 @@ def test_one_incident_failed(
@responses.activate
@pytest.mark.parametrize("fake_qem", [("NoResultsError isn't raised")], indirect=True)
def test_one_aggr_failed(
fake_qem, fake_openqa_comment_api, fake_responses_updating_job, caplog
fake_qem,
fake_openqa_comment_api,
fake_responses_updating_job,
fake_openqa_older_jobs_api,
caplog,
):
caplog.set_level(logging.DEBUG, logger="bot.approver")

Expand Down Expand Up @@ -466,6 +521,67 @@ def test_approval_still_blocked_if_openqa_comment_not_relevant(
fake_qem,
fake_responses_for_unblocking_incidents_via_openqa_comments,
fake_openqa_comment_api,
fake_openqa_older_jobs_api,
caplog,
):
caplog.set_level(logging.DEBUG, logger="bot.approver")
assert approver() == 0
messages = [x[-1] for x in caplog.record_tuples]
assert "* SUSE:Maintenance:2:200" not in messages


@responses.activate
@pytest.mark.parametrize("fake_qem", [("NoResultsError isn't raised")], indirect=True)
@pytest.mark.parametrize(
    "fake_responses_for_unblocking_incidents_via_older_ok_result", [(2)], indirect=True
)
def test_approval_unblocked_via_openqa_older_ok_job(
    fake_qem,
    fake_responses_for_unblocking_incidents_via_older_ok_result,
    caplog,
):
    """The incident is approved when the dashboard records older job 100005 as passed."""
    caplog.set_level(logging.DEBUG, logger="bot.approver")
    dashboard_job_url = re.compile(f"{QEM_DASHBOARD}api/jobs/100005")
    responses.add(responses.GET, dashboard_job_url, json={"status": "passed"})

    assert approver() == 0

    logged = [record[-1] for record in caplog.record_tuples]
    assert "* SUSE:Maintenance:2:200" in logged


@responses.activate
@pytest.mark.parametrize("fake_qem", [("NoResultsError isn't raised")], indirect=True)
@pytest.mark.parametrize(
    "fake_responses_for_unblocking_incidents_via_older_ok_result", [(2)], indirect=True
)
def test_approval_still_blocked_via_openqa_older_ok_job_because_not_in_dashboard(
    fake_qem,
    fake_responses_for_unblocking_incidents_via_older_ok_result,
    caplog,
):
    """The incident stays unapproved when the dashboard answers with an error for job 100005."""
    caplog.set_level(logging.DEBUG, logger="bot.approver")
    dashboard_job_url = re.compile(f"{QEM_DASHBOARD}api/jobs/100005")
    responses.add(responses.GET, dashboard_job_url, json={"error": "Job not found"})

    assert approver() == 0

    logged = [record[-1] for record in caplog.record_tuples]
    assert "* SUSE:Maintenance:2:200" not in logged


@responses.activate
@pytest.mark.parametrize("fake_qem", [("NoResultsError isn't raised")], indirect=True)
@pytest.mark.parametrize(
"fake_responses_for_unblocking_incidents_via_older_ok_result",
[(2222)],
indirect=True,
)
def test_approval_still_blocked_if_openqa_older_job_dont_include_incident(
fake_qem,
fake_responses_for_unblocking_incidents_via_older_ok_result,
caplog,
):
caplog.set_level(logging.DEBUG, logger="bot.approver")
Expand Down

0 comments on commit 813f225

Please sign in to comment.