Skip to content

Commit

Permalink
[Runtimes] Fix builder pod cleanup (#1259)
Browse files Browse the repository at this point in the history
  • Loading branch information
Hedingber committed Aug 29, 2021
1 parent 68d4f31 commit 9317ee2
Show file tree
Hide file tree
Showing 8 changed files with 240 additions and 59 deletions.
26 changes: 22 additions & 4 deletions mlrun/api/api/endpoints/runtime_resources.py
@@ -1,3 +1,4 @@
import copy
import http
import typing

Expand Down Expand Up @@ -197,6 +198,7 @@ def _delete_runtime_resources(
(
allowed_projects,
grouped_by_project_runtime_resources_output,
is_non_project_runtime_resource_exists,
) = _get_runtime_resources_allowed_projects(
project,
auth_info,
Expand All @@ -211,16 +213,26 @@ def _delete_runtime_resources(
allowed_projects
)
if label_selector:
label_selector = ",".join([label_selector, permissions_label_selector])
computed_label_selector = ",".join(
[label_selector, permissions_label_selector]
)
else:
label_selector = permissions_label_selector
computed_label_selector = permissions_label_selector
mlrun.api.crud.RuntimeResources().delete_runtime_resources(
db_session, kind, object_id, computed_label_selector, force, grace_period,
)
if is_non_project_runtime_resource_exists:
# delete one more time, without adding the allowed projects selector
mlrun.api.crud.RuntimeResources().delete_runtime_resources(
db_session, kind, object_id, label_selector, force, grace_period,
)
if return_body:
filtered_projects = copy.deepcopy(allowed_projects)
if is_non_project_runtime_resource_exists:
filtered_projects.append("")
return mlrun.api.crud.RuntimeResources().filter_and_format_grouped_by_project_runtime_resources_output(
grouped_by_project_runtime_resources_output,
allowed_projects,
filtered_projects,
mlrun.api.schemas.ListRuntimeResourcesGroupByField.project,
)
else:
Expand All @@ -244,6 +256,7 @@ def _list_runtime_resources(
(
allowed_projects,
grouped_by_project_runtime_resources_output,
_,
) = _get_runtime_resources_allowed_projects(
project, auth_info, label_selector, kind_filter, object_id
)
Expand All @@ -260,7 +273,7 @@ def _get_runtime_resources_allowed_projects(
object_id: typing.Optional[str] = None,
action: mlrun.api.schemas.AuthorizationAction = mlrun.api.schemas.AuthorizationAction.read,
) -> typing.Tuple[
typing.List[str], mlrun.api.schemas.GroupedByProjectRuntimeResourcesOutput,
typing.List[str], mlrun.api.schemas.GroupedByProjectRuntimeResourcesOutput, bool
]:
if project != "*":
mlrun.api.utils.clients.opa.Client().query_project_permissions(
Expand All @@ -275,10 +288,14 @@ def _get_runtime_resources_allowed_projects(
mlrun.api.schemas.ListRuntimeResourcesGroupByField.project,
)
projects = []
is_non_project_runtime_resource_exists = False
for (
project,
kind_runtime_resources_map,
) in grouped_by_project_runtime_resources_output.items():
if not project:
is_non_project_runtime_resource_exists = True
continue
projects.append(project)
allowed_projects = mlrun.api.utils.clients.opa.Client().filter_project_resources_by_permissions(
mlrun.api.schemas.AuthorizationResourceTypes.runtime_resource,
Expand All @@ -290,6 +307,7 @@ def _get_runtime_resources_allowed_projects(
return (
allowed_projects,
grouped_by_project_runtime_resources_output,
is_non_project_runtime_resource_exists,
)


Expand Down
6 changes: 6 additions & 0 deletions mlrun/builder.py
Expand Up @@ -58,6 +58,7 @@ def make_dockerfile(


def make_kaniko_pod(
project: str,
context,
dest,
dockerfile=None,
Expand Down Expand Up @@ -89,6 +90,7 @@ def make_kaniko_pod(
config.httpdb.builder.kaniko_image,
args=args,
kind="build",
project=project,
)
kpod.env = builder_env

Expand Down Expand Up @@ -136,6 +138,7 @@ def upload_tarball(source_dir, target, secrets=None):


def build_image(
project: str,
dest,
commands=None,
source="",
Expand Down Expand Up @@ -223,6 +226,7 @@ def build_image(
)

kpod = make_kaniko_pod(
project,
context,
dest,
dockertext=dock,
Expand Down Expand Up @@ -276,6 +280,7 @@ def build_runtime(
):
build = runtime.spec.build
namespace = runtime.metadata.namespace
project = runtime.metadata.project
if skip_deployed and runtime.is_deployed:
runtime.status.state = mlrun.api.schemas.FunctionState.ready
return True
Expand Down Expand Up @@ -308,6 +313,7 @@ def build_runtime(
with_mlrun = False

status = build_image(
project,
build.image,
base_image=base_image,
commands=build.commands,
Expand Down
8 changes: 7 additions & 1 deletion mlrun/k8s_utils.py
Expand Up @@ -427,6 +427,7 @@ def __init__(
args=None,
namespace="",
kind="job",
project=None,
):
self.namespace = namespace
self.name = ""
Expand All @@ -437,7 +438,12 @@ def __init__(
self._volumes = []
self._mounts = []
self.env = None
self._labels = {"mlrun/task-name": task_name, "mlrun/class": kind}
self.project = project or mlrun.mlconf.default_project
self._labels = {
"mlrun/task-name": task_name,
"mlrun/class": kind,
"mlrun/project": self.project,
}
self._annotations = {}
self._init_container = None

Expand Down
19 changes: 13 additions & 6 deletions mlrun/runtimes/base.py
Expand Up @@ -1259,6 +1259,10 @@ def _are_resources_coupled_to_run_object() -> bool:
"""
return False

@staticmethod
def _expect_pods_without_uid() -> bool:
return False

def _list_pods(self, namespace: str, label_selector: str = None) -> List:
k8s_helper = get_k8s_helper()
pods = k8s_helper.list_pods(namespace, selector=label_selector)
Expand Down Expand Up @@ -1554,11 +1558,14 @@ def _pre_deletion_runtime_resource_run_actions(

# if cannot resolve related run nothing to do
if not uid:
logger.warning(
"Could not resolve run uid from runtime resource. Skipping pre-deletion actions",
runtime_resource=runtime_resource,
)
raise ValueError("Could not resolve run uid from runtime resource")
if not self._expect_pods_without_uid():
logger.warning(
"Could not resolve run uid from runtime resource. Skipping pre-deletion actions",
runtime_resource=runtime_resource,
)
raise ValueError("Could not resolve run uid from runtime resource")
else:
return

logger.info(
"Performing pre-deletion actions before cleaning up runtime resources",
Expand Down Expand Up @@ -1751,7 +1758,7 @@ def _add_resource_to_grouped_by_project_resources_response(
resource: mlrun.api.schemas.RuntimeResource,
):
if "mlrun/class" in resource.labels:
project = resource.labels.get("mlrun/project", config.default_project)
project = resource.labels.get("mlrun/project", "")
mlrun_class = resource.labels["mlrun/class"]
kind = self._resolve_kind_from_class(mlrun_class)
self._add_resource_to_grouped_by_field_resources_response(
Expand Down
8 changes: 8 additions & 0 deletions mlrun/runtimes/kubejob.py
Expand Up @@ -316,6 +316,14 @@ def func_to_pod(image, runtime, extra_env, command, args, workdir):


class KubeRuntimeHandler(BaseRuntimeHandler):
@staticmethod
def _expect_pods_without_uid() -> bool:
"""
builder pods are handled as part of this runtime handler - they are not coupled to run object, therefore they
don't have the uid in their labels
"""
return True

@staticmethod
def _are_resources_coupled_to_run_object() -> bool:
return True
Expand Down
91 changes: 86 additions & 5 deletions tests/api/api/test_runtime_resources.py
Expand Up @@ -322,6 +322,41 @@ def test_delete_runtime_resources_opa_filtering(
assert response.status_code == http.HTTPStatus.NO_CONTENT.value


def test_delete_runtime_resources_with_legacy_builder_pod_opa_filtering(
db: sqlalchemy.orm.Session, client: fastapi.testclient.TestClient
) -> None:
(
project_1,
project_1_job_name,
no_project_builder_name,
grouped_by_project_runtime_resources_output,
) = _generate_grouped_by_project_runtime_resources_with_legacy_builder_output()

mlrun.api.crud.RuntimeResources().list_runtime_resources = unittest.mock.Mock(
return_value=grouped_by_project_runtime_resources_output
)

allowed_projects = []
mlrun.api.utils.clients.opa.Client().filter_project_resources_by_permissions = unittest.mock.Mock(
return_value=allowed_projects
)
# no projects are allowed, but there is a non project runtime resource (the legacy builder pod)
# therefore delete resources will be called, but without filter on project in the label selector
_mock_runtime_handlers_delete_resources(
mlrun.runtimes.RuntimeKinds.runtime_with_handlers(), allowed_projects
)
response = client.delete("/api/projects/*/runtime-resources",)
body = response.json()
expected_body = _filter_allowed_projects_from_grouped_by_project_runtime_resources_output(
[""], grouped_by_project_runtime_resources_output
)
assert deepdiff.DeepDiff(body, expected_body, ignore_order=True,) == {}

# legacy endpoint
response = client.delete("/api/runtimes",)
assert response.status_code == http.HTTPStatus.NO_CONTENT.value


def test_delete_runtime_resources_with_kind(
db: sqlalchemy.orm.Session, client: fastapi.testclient.TestClient
) -> None:
Expand Down Expand Up @@ -420,12 +455,13 @@ def _assert_delete_resources_label_selector(
force: bool = False,
grace_period: int = mlrun.mlconf.runtime_resources_deletion_grace_period,
):
assert (
mlrun.api.api.endpoints.runtime_resources._generate_label_selector_for_allowed_projects(
allowed_projects
if allowed_projects:
assert (
mlrun.api.api.endpoints.runtime_resources._generate_label_selector_for_allowed_projects(
allowed_projects
)
in label_selector
)
in label_selector
)

for kind in kinds:
runtime_handler = mlrun.runtimes.get_runtime_handler(kind)
Expand All @@ -452,6 +488,51 @@ def _assert_empty_responses_in_delete_endpoints(client: fastapi.testclient.TestC
assert response.status_code == http.HTTPStatus.NO_CONTENT.value


def _generate_grouped_by_project_runtime_resources_with_legacy_builder_output():
no_project = ""
project_1 = "project-1"
project_1_job_name = "project-1-job-name"
no_project_builder_name = "builder-name"
grouped_by_project_runtime_resources_output = {
project_1: {
mlrun.runtimes.RuntimeKinds.job: mlrun.api.schemas.RuntimeResources(
pod_resources=[
mlrun.api.schemas.RuntimeResource(
name=project_1_job_name,
labels={
"mlrun/project": project_1,
# using name as uid to make assertions easier later
"mlrun/uid": project_1_job_name,
"mlrun/class": mlrun.runtimes.RuntimeKinds.job,
},
)
],
crd_resources=[],
)
},
no_project: {
mlrun.runtimes.RuntimeKinds.job: mlrun.api.schemas.RuntimeResources(
pod_resources=[
mlrun.api.schemas.RuntimeResource(
name=no_project_builder_name,
labels={
"mlrun/class": "build",
"mlrun/task-name": "some-task-name",
},
)
],
crd_resources=[],
),
},
}
return (
project_1,
project_1_job_name,
no_project_builder_name,
grouped_by_project_runtime_resources_output,
)


def _generate_grouped_by_project_runtime_resources_output():
project_1 = "project-1"
project_2 = "project-2"
Expand Down
4 changes: 3 additions & 1 deletion tests/api/runtime_handlers/base.py
Expand Up @@ -176,6 +176,8 @@ def _assert_runtime_handler_list_resources(
runtime_handler=runtime_handler,
)

return resources

def _assert_list_resources_grouped_by_job_response(
self,
resources: mlrun.api.schemas.GroupedByJobRuntimeResourcesOutput,
Expand Down Expand Up @@ -203,7 +205,7 @@ def _assert_list_resources_grouped_by_project_response(
def _extract_project_and_kind_from_runtime_resources_labels(
labels: dict,
) -> typing.Tuple[str, str]:
project = labels["mlrun/project"]
project = labels.get("mlrun/project", "")
class_ = labels["mlrun/class"]
kind = runtime_handler._resolve_kind_from_class(class_)
return project, kind
Expand Down

0 comments on commit 9317ee2

Please sign in to comment.