Approve jobs if at least one older job passed

- If an aggregate update failed, do not give up immediately: look at the previous openQA jobs. If an older job is present, green, not too old, still present in the qem-dashboard (to avoid using tests that belong to a different Release Request) and includes the update under test, ignore that failure. This reduces the impact of one test being broken one day and a different test another day, which would leave the update unapproved even though the combined results are all green, just not at the same time.
openSUSE · Mar 20, 2024 · 813f225 · 813f225
1 parent 7b921a0
commit 813f225
Show file tree

Hide file tree

Showing 4 changed files with 239 additions and 4 deletions.
diff --git a/openqabot/__init__.py b/openqabot/__init__.py
@@ -9,3 +9,4 @@
 OPENQA_URL = "openqa.suse.de"
 DEVELOPMENT_PARENT_GROUP_ID = 9
 DOWNLOAD_BASE = "http://download.suse.de/ibs/SUSE:/Maintenance:/"
+OLDEST_APPROVAL_JOB_DAYS = 6
diff --git a/openqabot/approver.py b/openqabot/approver.py
@@ -5,6 +5,7 @@
 from logging import getLogger
 from typing import List
 from urllib.error import HTTPError
+from datetime import timedelta, datetime
 import re
 
 import osc.conf
@@ -15,7 +16,7 @@
 from openqabot.openqa import openQAInterface
 from openqabot.dashboard import get_json
 
-from . import OBS_GROUP, OBS_MAINT_PRJ, OBS_URL, QEM_DASHBOARD
+from . import OBS_GROUP, OBS_MAINT_PRJ, OBS_URL, QEM_DASHBOARD, OLDEST_APPROVAL_JOB_DAYS
 from .loader.qem import (
     IncReq,
     JobAggr,
@@ -131,11 +132,96 @@ def is_job_marked_acceptable_for_incident(self, job_id: int, inc: int) -> bool:
             pass
         return False
 
-    def job_acceptable(self, inc: int, res) -> bool:
+    @lru_cache(maxsize=512)
+    def validate_job_qam(self, job: int) -> bool:
+        # Check that valid test result is still present in the dashboard (see https://github.com/openSUSE/qem-dashboard/pull/78/files) to avoid using results related to an old release request
+        qam_data = get_json("api/jobs/" + str(job), headers=self.token)
+        if not qam_data:
+            return False
+        if "error" in qam_data:
+            log.info(
+                "Cannot find job %s in the dashboard database to make sure it is valid",
+                job,
+            )
+            return False
+        if qam_data["status"] != "passed":
+            log.info(
+                'Job %s is not recorded as "passed" in the qam-dashboard database',
+                job,
+            )
+            return False
+        return True
+
+    @lru_cache(maxsize=512)
+    def was_ok_before(self, failed_job_id: int, inc: int) -> bool:
+        # We need a considerable amount of older jobs, since there could be many failed manual restarts from same day
+        older_jobs = self.client.get_older_jobs(failed_job_id, 20)
+        if older_jobs == []:
+            log.info("Cannot find older jobs for %s", failed_job_id)
+            return False
+
+        current_build = older_jobs["data"][0]["build"][:-2]
+        current_build_date = datetime.strptime(current_build, "%Y%m%d")
+
+        # Use at most X days old build. Don't go back in time too much to reduce risk of using invalid tests
+        oldest_build_usable = current_build_date - timedelta(
+            days=OLDEST_APPROVAL_JOB_DAYS
+        )
+
+        regex = re.compile(r"(.*)Maintenance:/%s/(.*)" % inc)
+        # Skipping first job, which was the current one
+        for i in range(1, len(older_jobs["data"])):
+            job = older_jobs["data"][i]
+            job_build = job["build"][:-2]
+            job_build_date = datetime.strptime(job_build, "%Y%m%d")
+
+            # Check the job is not too old
+            if job_build_date < oldest_build_usable:
+                log.info(
+                    "Cannot ignore aggregate failure %s for update %s because: Older jobs are too old to be considered"
+                    % (failed_job_id, inc)
+                )
+                return False
+
+            if job["result"] != "passed" and job["result"] != "softfailed":
+                continue
+
+            # Check the job contains the update under test
+            job_settings = self.client.get_single_job(job["id"])
+            if not regex.match(str(job_settings)):
+                # Likely older jobs don't have it either. Giving up
+                log.info(
+                    "Cannot ignore aggregate failure %s for update %s because: Older passing jobs do not have update under test"
+                    % (failed_job_id, inc)
+                )
+                return False
+
+            if not self.validate_job_qam(job["id"]):
+                log.info(
+                    "Cannot ignore failed aggregate %s using %s for update %s because is not present in qem-dashboard. It's likely about an older release request"
+                    % (failed_job_id, job["id"], inc)
+                )
+                return False
+
+            log.info(
+                "Ignoring failed aggregate %s and using instead %s for update %s"
+                % (failed_job_id, job["id"], inc)
+            )
+            return True
+
+        log.info(
+            "Cannot ignore aggregate failure %s for update %s because: Older usable jobs did not succeed. Run out of jobs to evaluate."
+            % (failed_job_id, inc)
+        )
+        return False
+
+    def job_acceptable(self, inc: int, api: str, res) -> bool:
         """
         Check each job if it is acceptable for different reasons.
 
         Keep jobs marked as acceptable for one incident by openQA comments.
+
+        Keep jobs marked as acceptable if are aggregate and were ok in the previous days.
         """
         if res["status"] == "passed":
             return True
@@ -145,6 +231,13 @@ def job_acceptable(self, inc: int, res) -> bool:
                 "Ignoring failed job %s for incident %s due to openQA comment", url, inc
             )
             return True
+        if api == "api/jobs/update/" and self.was_ok_before(res["job_id"], inc):
+            log.info(
+                "Ignoring failed aggregate job %s for incident %s due to older eligible openQA job being ok",
+                url,
+                inc,
+            )
+            return True
         log.info("Found failed, not-ignored job %s for incident %s", url, inc)
         return False
 
@@ -156,7 +249,7 @@ def get_jobs(self, job_aggr: JobAggr, api: str, inc: int) -> bool:
                 "Job setting %s not found for incident %s"
                 % (str(job_aggr.id), str(inc))
             )
-        return all(self.job_acceptable(inc, r) for r in results)
+        return all(self.job_acceptable(inc, api, r) for r in results)
 
     def get_incident_result(self, jobs: List[JobAggr], api: str, inc: int) -> bool:
         res = False

diff --git a/openqabot/openqa.py b/openqabot/openqa.py
@@ -105,3 +105,28 @@ def is_devel_group(self, groupid: int) -> bool:
         return (
             ret[0]["parent_id"] == DEVELOPMENT_PARENT_GROUP_ID if ret else True
         )  # ID of Development Group
+
+    @lru_cache(maxsize=256)
+    def get_single_job(self, job_id: int):
+        ret = None
+        try:
+            ret = self.openqa.openqa_request(
+                "GET",
+                "jobs/%s" % job_id,
+            )["job"]
+        except RequestError as e:
+            log.exception(e)
+        return ret
+
+    @lru_cache(maxsize=256)
+    def get_older_jobs(self, job_id: int, limit: int):
+        ret = []
+        try:
+            ret = self.openqa.openqa_request(
+                "GET",
+                "/tests/%s/ajax?previous_limit=%s&next_limit=0" % (job_id, limit),
+                retries=self.retries,
+            )
+        except RequestError as e:
+            log.exception(e)
+        return ret
diff --git a/tests/test_approve.py b/tests/test_approve.py
@@ -24,6 +24,57 @@
 openqa_instance_url = urlparse("http://instance.qa")
 
 
+@pytest.fixture(scope="function")
+def fake_responses_for_unblocking_incidents_via_older_ok_result(request):
+    responses.add(
+        responses.GET,
+        f"{QEM_DASHBOARD}api/jobs/update/20005",
+        json=[
+            {"job_id": 100000, "status": "passed"},
+            {"job_id": 100002, "status": "failed"},
+            {"job_id": 100003, "status": "passed"},
+        ],
+    )
+    add_two_passed_response()
+    responses.add(
+        responses.GET,
+        url="http://instance.qa/api/v1/jobs/100002/comments",
+        json=[{"text": "@review:acceptable_for:incident_555:foo"}],
+    )
+    responses.add(
+        responses.GET,
+        re.compile(r"http://instance.qa/tests/.*/ajax\?previous_limit=.*&next_limit=0"),
+        json={
+            "data": [
+                {"build": "20240115-1", "id": 100002, "result": "failed"},
+                {"build": "20240114-1", "id": 100004, "result": "failed"},
+                {"build": "20240113-1", "id": 100005, "result": "softfailed"},
+            ]
+        },
+    )
+    responses.add(
+        responses.GET,
+        re.compile(f"http://instance.qa/api/v1/jobs/.*"),
+        json={
+            "job": {
+                "settings": {
+                    "BASE_TEST_REPOS": "http://download.suse.de/ibs/SUSE:/Maintenance:/1111/SUSE_Updates_SLE-Module-Basesystem_15-SP5_x86_64/,http://download.suse.de/ibs/SUSE:/Maintenance:/%s/SUSE_Updates_SLE-Module-Basesystem_15-SP5_x86_64/"
+                    % request.param
+                }
+            }
+        },
+    )
+
+
+@pytest.fixture(scope="function")
+def fake_openqa_older_jobs_api():
+    responses.add(
+        responses.GET,
+        re.compile(r"http://instance.qa/tests/.*/ajax\?previous_limit=.*&next_limit=0"),
+        status=404,
+    )
+
+
 def add_two_passed_response():
     responses.add(
         responses.GET,
@@ -404,7 +455,11 @@ def test_one_incident_failed(
 @responses.activate
 @pytest.mark.parametrize("fake_qem", [("NoResultsError isn't raised")], indirect=True)
 def test_one_aggr_failed(
-    fake_qem, fake_openqa_comment_api, fake_responses_updating_job, caplog
+    fake_qem,
+    fake_openqa_comment_api,
+    fake_responses_updating_job,
+    fake_openqa_older_jobs_api,
+    caplog,
 ):
     caplog.set_level(logging.DEBUG, logger="bot.approver")
 
@@ -466,6 +521,67 @@ def test_approval_still_blocked_if_openqa_comment_not_relevant(
     fake_qem,
     fake_responses_for_unblocking_incidents_via_openqa_comments,
     fake_openqa_comment_api,
+    fake_openqa_older_jobs_api,
+    caplog,
+):
+    caplog.set_level(logging.DEBUG, logger="bot.approver")
+    assert approver() == 0
+    messages = [x[-1] for x in caplog.record_tuples]
+    assert "* SUSE:Maintenance:2:200" not in messages
+
+
+@responses.activate
+@pytest.mark.parametrize("fake_qem", [("NoResultsError isn't raised")], indirect=True)
+@pytest.mark.parametrize(
+    "fake_responses_for_unblocking_incidents_via_older_ok_result", [(2)], indirect=True
+)
+def test_approval_unblocked_via_openqa_older_ok_job(
+    fake_qem,
+    fake_responses_for_unblocking_incidents_via_older_ok_result,
+    caplog,
+):
+    caplog.set_level(logging.DEBUG, logger="bot.approver")
+    responses.add(
+        responses.GET,
+        re.compile(f"{QEM_DASHBOARD}api/jobs/100005"),
+        json={"status": "passed"},
+    )
+    assert approver() == 0
+    messages = [x[-1] for x in caplog.record_tuples]
+    assert "* SUSE:Maintenance:2:200" in messages
+
+
+@responses.activate
+@pytest.mark.parametrize("fake_qem", [("NoResultsError isn't raised")], indirect=True)
+@pytest.mark.parametrize(
+    "fake_responses_for_unblocking_incidents_via_older_ok_result", [(2)], indirect=True
+)
+def test_approval_still_blocked_via_openqa_older_ok_job_because_not_in_dashboard(
+    fake_qem,
+    fake_responses_for_unblocking_incidents_via_older_ok_result,
+    caplog,
+):
+    caplog.set_level(logging.DEBUG, logger="bot.approver")
+    responses.add(
+        responses.GET,
+        re.compile(f"{QEM_DASHBOARD}api/jobs/100005"),
+        json={"error": "Job not found"},
+    )
+    assert approver() == 0
+    messages = [x[-1] for x in caplog.record_tuples]
+    assert "* SUSE:Maintenance:2:200" not in messages
+
+
+@responses.activate
+@pytest.mark.parametrize("fake_qem", [("NoResultsError isn't raised")], indirect=True)
+@pytest.mark.parametrize(
+    "fake_responses_for_unblocking_incidents_via_older_ok_result",
+    [(2222)],
+    indirect=True,
+)
+def test_approval_still_blocked_if_openqa_older_job_dont_include_incident(
+    fake_qem,
+    fake_responses_for_unblocking_incidents_via_older_ok_result,
     caplog,
 ):
     caplog.set_level(logging.DEBUG, logger="bot.approver")