From 147a9607b8e6cd0030b0aae118eed894d86fed9f Mon Sep 17 00:00:00 2001 From: Christian Berendt Date: Fri, 5 Dec 2025 10:23:44 +0100 Subject: [PATCH] Use setup_cloud_environment() for OpenStack connections in manage commands Replace direct get_cloud_connection() calls with setup_cloud_environment() and cleanup_cloud_environment() pattern in all manage commands. This ensures proper cloud configuration by: - Copying /etc/openstack/clouds.yaml to /tmp - Loading passwords securely from vault via get_cloud_password() - Creating /tmp/secure.yml with decrypted credentials - Properly cleaning up temporary files after execution Affected commands: - manage loadbalancer list/reset/delete - manage amphora restore/rotate - manage volume list/repair - manage server migrate/list/clean Also adds --cloud parameter to ServerMigrate for consistency. AI-assisted: Claude Code Signed-off-by: Christian Berendt --- osism/commands/amphora.py | 124 +++++---- osism/commands/loadbalancer.py | 292 +++++++++++--------- osism/commands/server.py | 473 ++++++++++++++++++--------------- osism/commands/volume.py | 381 ++++++++++++++------------ 4 files changed, 711 insertions(+), 559 deletions(-) diff --git a/osism/commands/amphora.py b/osism/commands/amphora.py index d618fadf..ac533360 100644 --- a/osism/commands/amphora.py +++ b/osism/commands/amphora.py @@ -8,8 +8,8 @@ from loguru import logger import openstack -from osism.commands import get_cloud_connection from osism.commands.octavia import wait_for_amphora_boot, wait_for_amphora_delete +from osism.tasks.openstack import cleanup_cloud_environment, setup_cloud_environment # Default age threshold for rotation (30 days in seconds) DEFAULT_ROTATION_AGE_SECONDS = 2592000 @@ -38,24 +38,32 @@ def take_action(self, parsed_args): cloud = parsed_args.cloud loadbalancer_id = parsed_args.loadbalancer - conn = get_cloud_connection(cloud) + temp_files, original_cwd, success = setup_cloud_environment(cloud) + if not success: + logger.error(f"Failed to setup cloud environment for '{cloud}'") + return 1 - if loadbalancer_id: - amphorae = conn.load_balancer.amphorae( - status="ERROR", loadbalancer_id=loadbalancer_id - ) - else: - amphorae = conn.load_balancer.amphorae(status="ERROR") + try: + conn = openstack.connect(cloud=cloud) - for amphora in amphorae: - logger.info( - f"Amphora {amphora.id} of loadbalancer {amphora.loadbalancer_id} is in state ERROR, trigger amphora failover" - ) - conn.load_balancer.failover_amphora(amphora.id) - sleep(10) # wait for the octavia API + if loadbalancer_id: + amphorae = conn.load_balancer.amphorae( + status="ERROR", loadbalancer_id=loadbalancer_id + ) + else: + amphorae = conn.load_balancer.amphorae(status="ERROR") + + for amphora in amphorae: + logger.info( + f"Amphora {amphora.id} of loadbalancer {amphora.loadbalancer_id} is in state ERROR, trigger amphora failover" + ) + conn.load_balancer.failover_amphora(amphora.id) + sleep(10) # wait for the octavia API - wait_for_amphora_boot(conn, amphora.loadbalancer_id) - wait_for_amphora_delete(conn, amphora.loadbalancer_id) + wait_for_amphora_boot(conn, amphora.loadbalancer_id) + wait_for_amphora_delete(conn, amphora.loadbalancer_id) + finally: + cleanup_cloud_environment(temp_files, original_cwd) class AmphoraRotate(Command): @@ -88,48 +96,58 @@ def take_action(self, parsed_args): loadbalancer_id = parsed_args.loadbalancer force = parsed_args.force - conn = get_cloud_connection(cloud) - - done = [] + temp_files, original_cwd, success = setup_cloud_environment(cloud) + if not success: + logger.error(f"Failed to setup cloud environment for '{cloud}'") + return 1 - if loadbalancer_id: - amphorae = conn.load_balancer.amphorae( - status="ALLOCATED", loadbalancer_id=loadbalancer_id - ) - else: - amphorae = conn.load_balancer.amphorae(status="ALLOCATED") + try: + conn = openstack.connect(cloud=cloud) - for amphora in amphorae: - rotate = False + done = [] - if amphora.loadbalancer_id in done: - continue - - duration = datetime.now(timezone.utc) - dateutil_parser.parse( - amphora.created_at - ) - if duration.total_seconds() > DEFAULT_ROTATION_AGE_SECONDS: - logger.info(f"Amphora {amphora.id} is older than 30 days") - rotate = True - elif force: - logger.info(f"Force rotation of Amphora {amphora.id}") - rotate = True + if loadbalancer_id: + amphorae = conn.load_balancer.amphorae( + status="ALLOCATED", loadbalancer_id=loadbalancer_id + ) else: - continue + amphorae = conn.load_balancer.amphorae(status="ALLOCATED") - if rotate: - logger.info( - f"Amphora {amphora.id} of loadbalancer {amphora.loadbalancer_id} is rotated by a loadbalancer failover" - ) - try: - conn.load_balancer.failover_load_balancer(amphora.loadbalancer_id) - sleep(10) # wait for the octavia API + for amphora in amphorae: + rotate = False - done.append(amphora.loadbalancer_id) + if amphora.loadbalancer_id in done: + continue - wait_for_amphora_boot(conn, amphora.loadbalancer_id) - wait_for_amphora_delete(conn, amphora.loadbalancer_id) - except openstack.exceptions.ConflictException: - logger.warning( - f"Conflict while rotating loadbalancer {amphora.loadbalancer_id}, skipping" + duration = datetime.now(timezone.utc) - dateutil_parser.parse( + amphora.created_at + ) + if duration.total_seconds() > DEFAULT_ROTATION_AGE_SECONDS: + logger.info(f"Amphora {amphora.id} is older than 30 days") + rotate = True + elif force: + logger.info(f"Force rotation of Amphora {amphora.id}") + rotate = True + else: + continue + + if rotate: + logger.info( + f"Amphora {amphora.id} of loadbalancer {amphora.loadbalancer_id} is rotated by a loadbalancer failover" ) + try: + conn.load_balancer.failover_load_balancer( + amphora.loadbalancer_id + ) + sleep(10) # wait for the octavia API + + done.append(amphora.loadbalancer_id) + + wait_for_amphora_boot(conn, amphora.loadbalancer_id) + wait_for_amphora_delete(conn, amphora.loadbalancer_id) + except openstack.exceptions.ConflictException: + logger.warning( + f"Conflict while rotating loadbalancer {amphora.loadbalancer_id}, skipping" + ) + finally: + cleanup_cloud_environment(temp_files, original_cwd) diff --git a/osism/commands/loadbalancer.py b/osism/commands/loadbalancer.py index 778f7162..41d58aa5 100644 --- a/osism/commands/loadbalancer.py +++ b/osism/commands/loadbalancer.py @@ -10,8 +10,10 @@ from tabulate import tabulate import yaml -from osism.commands import get_cloud_connection +import openstack + from osism.commands.octavia import wait_for_amphora_boot +from osism.tasks.openstack import cleanup_cloud_environment, setup_cloud_environment from osism.tasks.conductor.utils import get_vault @@ -154,13 +156,35 @@ def get_parser(self, prog_name): def take_action(self, parsed_args): status_type = parsed_args.status_type - conn = get_cloud_connection(parsed_args.cloud) + cloud = parsed_args.cloud + + temp_files, original_cwd, success = setup_cloud_environment(cloud) + if not success: + logger.error(f"Failed to setup cloud environment for '{cloud}'") + return 1 - result = [] - if status_type == "provisioning_status": - # List loadbalancers with problematic provisioning status - for status in ["PENDING_CREATE", "PENDING_UPDATE", "ERROR"]: - for lb in conn.load_balancer.load_balancers(provisioning_status=status): + try: + conn = openstack.connect(cloud=cloud) + + result = [] + if status_type == "provisioning_status": + # List loadbalancers with problematic provisioning status + for status in ["PENDING_CREATE", "PENDING_UPDATE", "ERROR"]: + for lb in conn.load_balancer.load_balancers( + provisioning_status=status + ): + result.append( + [ + lb.id, + lb.name, + lb.provisioning_status, + lb.operating_status, + lb.project_id, + ] + ) + else: + # List loadbalancers with ERROR operating status + for lb in conn.load_balancer.load_balancers(operating_status="ERROR"): result.append( [ lb.id, @@ -170,35 +194,25 @@ def take_action(self, parsed_args): lb.project_id, ] ) - else: - # List loadbalancers with ERROR operating status - for lb in conn.load_balancer.load_balancers(operating_status="ERROR"): - result.append( - [ - lb.id, - lb.name, - lb.provisioning_status, - lb.operating_status, - lb.project_id, - ] - ) - if result: - print( - tabulate( - result, - headers=[ - "ID", - "Name", - "Provisioning Status", - "Operating Status", - "Project ID", - ], - tablefmt="psql", + if result: + print( + tabulate( + result, + headers=[ + "ID", + "Name", + "Provisioning Status", + "Operating Status", + "Project ID", + ], + tablefmt="psql", + ) ) - ) - else: - logger.info("No loadbalancers with problematic status found") + else: + logger.info("No loadbalancers with problematic status found") + finally: + cleanup_cloud_environment(temp_files, original_cwd) class LoadbalancerReset(Command): @@ -243,76 +257,85 @@ def take_action(self, parsed_args): status_type = parsed_args.status_type yes = parsed_args.yes no_failover = parsed_args.no_failover + cloud = parsed_args.cloud - conn = get_cloud_connection(parsed_args.cloud) - - # Get loadbalancer details - try: - lb = conn.load_balancer.get_load_balancer(loadbalancer_id) - except Exception as exc: - logger.error(f"Failed to get loadbalancer {loadbalancer_id}: {exc}") + temp_files, original_cwd, success = setup_cloud_environment(cloud) + if not success: + logger.error(f"Failed to setup cloud environment for '{cloud}'") return 1 - logger.info( - f"Loadbalancer {lb.name} ({lb.id}): " - f"provisioning_status={lb.provisioning_status}, " - f"operating_status={lb.operating_status}" - ) + try: + conn = openstack.connect(cloud=cloud) - # Validate status - if status_type == "provisioning_status": - if lb.provisioning_status not in ["PENDING_UPDATE", "ERROR"]: - logger.error( - f"Loadbalancer {loadbalancer_id} has provisioning_status " - f"'{lb.provisioning_status}', expected PENDING_UPDATE or ERROR. " - f"Use 'manage loadbalancer delete' for PENDING_CREATE status." - ) - return 1 - else: - if lb.operating_status != "ERROR": - logger.error( - f"Loadbalancer {loadbalancer_id} has operating_status " - f"'{lb.operating_status}', expected ERROR" - ) - return 1 - if lb.provisioning_status != "ACTIVE": - logger.error( - f"Loadbalancer {loadbalancer_id} has provisioning_status " - f"'{lb.provisioning_status}', expected ACTIVE for operating_status reset" - ) + # Get loadbalancer details + try: + lb = conn.load_balancer.get_load_balancer(loadbalancer_id) + except Exception as exc: + logger.error(f"Failed to get loadbalancer {loadbalancer_id}: {exc}") return 1 - # Confirm action - if not yes: - answer = prompt(f"Reset loadbalancer {lb.name} ({lb.id}) [yes/no]: ") - if answer.lower() not in ["yes", "y"]: - logger.info("Aborted") - return 0 - - # Connect to database - database = _get_octavia_database_connection() - if database is None: - return 1 + logger.info( + f"Loadbalancer {lb.name} ({lb.id}): " + f"provisioning_status={lb.provisioning_status}, " + f"operating_status={lb.operating_status}" + ) - try: - # Reset status in database - logger.info(f"Resetting {status_type} for {lb.name}") + # Validate status if status_type == "provisioning_status": - _reset_provisioning_status(database, lb.id) + if lb.provisioning_status not in ["PENDING_UPDATE", "ERROR"]: + logger.error( + f"Loadbalancer {loadbalancer_id} has provisioning_status " + f"'{lb.provisioning_status}', expected PENDING_UPDATE or ERROR. " + f"Use 'manage loadbalancer delete' for PENDING_CREATE status." + ) + return 1 else: - _reset_operating_status(database, lb.id) - - # Trigger failover - if not no_failover: - logger.info(f"Triggering failover for {lb.name}") - conn.load_balancer.failover_load_balancer(lb.id) - sleep(10) # wait for the octavia API - wait_for_amphora_boot(conn, lb.id) - - logger.info(f"Successfully reset loadbalancer {lb.name}") + if lb.operating_status != "ERROR": + logger.error( + f"Loadbalancer {loadbalancer_id} has operating_status " + f"'{lb.operating_status}', expected ERROR" + ) + return 1 + if lb.provisioning_status != "ACTIVE": + logger.error( + f"Loadbalancer {loadbalancer_id} has provisioning_status " + f"'{lb.provisioning_status}', expected ACTIVE for operating_status reset" + ) + return 1 + + # Confirm action + if not yes: + answer = prompt(f"Reset loadbalancer {lb.name} ({lb.id}) [yes/no]: ") + if answer.lower() not in ["yes", "y"]: + logger.info("Aborted") + return 0 + + # Connect to database + database = _get_octavia_database_connection() + if database is None: + return 1 + try: + # Reset status in database + logger.info(f"Resetting {status_type} for {lb.name}") + if status_type == "provisioning_status": + _reset_provisioning_status(database, lb.id) + else: + _reset_operating_status(database, lb.id) + + # Trigger failover + if not no_failover: + logger.info(f"Triggering failover for {lb.name}") + conn.load_balancer.failover_load_balancer(lb.id) + sleep(10) # wait for the octavia API + wait_for_amphora_boot(conn, lb.id) + + logger.info(f"Successfully reset loadbalancer {lb.name}") + + finally: + database.close() finally: - database.close() + cleanup_cloud_environment(temp_files, original_cwd) class LoadbalancerDelete(Command): @@ -342,53 +365,62 @@ def get_parser(self, prog_name): def take_action(self, parsed_args): loadbalancer_id = parsed_args.loadbalancer yes = parsed_args.yes + cloud = parsed_args.cloud - conn = get_cloud_connection(parsed_args.cloud) + temp_files, original_cwd, success = setup_cloud_environment(cloud) + if not success: + logger.error(f"Failed to setup cloud environment for '{cloud}'") + return 1 - # Get loadbalancer details try: - lb = conn.load_balancer.get_load_balancer(loadbalancer_id) - except Exception as exc: - logger.error(f"Failed to get loadbalancer {loadbalancer_id}: {exc}") - return 1 + conn = openstack.connect(cloud=cloud) - logger.info( - f"Loadbalancer {lb.name} ({lb.id}): " - f"provisioning_status={lb.provisioning_status}, " - f"operating_status={lb.operating_status}" - ) + # Get loadbalancer details + try: + lb = conn.load_balancer.get_load_balancer(loadbalancer_id) + except Exception as exc: + logger.error(f"Failed to get loadbalancer {loadbalancer_id}: {exc}") + return 1 - # Validate status - if lb.provisioning_status != "PENDING_CREATE": - logger.error( - f"Loadbalancer {loadbalancer_id} has provisioning_status " - f"'{lb.provisioning_status}', expected PENDING_CREATE. " - f"Use 'manage loadbalancer reset' for other status values." + logger.info( + f"Loadbalancer {lb.name} ({lb.id}): " + f"provisioning_status={lb.provisioning_status}, " + f"operating_status={lb.operating_status}" ) - return 1 - # Confirm action - if not yes: - answer = prompt(f"Delete loadbalancer {lb.name} ({lb.id}) [yes/no]: ") - if answer.lower() not in ["yes", "y"]: - logger.info("Aborted") - return 0 + # Validate status + if lb.provisioning_status != "PENDING_CREATE": + logger.error( + f"Loadbalancer {loadbalancer_id} has provisioning_status " + f"'{lb.provisioning_status}', expected PENDING_CREATE. " + f"Use 'manage loadbalancer reset' for other status values." + ) + return 1 - # Connect to database - database = _get_octavia_database_connection() - if database is None: - return 1 + # Confirm action + if not yes: + answer = prompt(f"Delete loadbalancer {lb.name} ({lb.id}) [yes/no]: ") + if answer.lower() not in ["yes", "y"]: + logger.info("Aborted") + return 0 - try: - # Set status to ERROR first so delete works - logger.info(f"Setting provisioning_status to ERROR for {lb.name}") - _reset_provisioning_status(database, lb.id, status="ERROR") + # Connect to database + database = _get_octavia_database_connection() + if database is None: + return 1 + + try: + # Set status to ERROR first so delete works + logger.info(f"Setting provisioning_status to ERROR for {lb.name}") + _reset_provisioning_status(database, lb.id, status="ERROR") - # Delete loadbalancer - logger.info(f"Deleting loadbalancer {lb.name}") - conn.load_balancer.delete_load_balancer(lb.id) + # Delete loadbalancer + logger.info(f"Deleting loadbalancer {lb.name}") + conn.load_balancer.delete_load_balancer(lb.id) - logger.info(f"Successfully deleted loadbalancer {lb.name}") + logger.info(f"Successfully deleted loadbalancer {lb.name}") + finally: + database.close() finally: - database.close() + cleanup_cloud_environment(temp_files, original_cwd) diff --git a/osism/commands/server.py b/osism/commands/server.py index 4b7f7c51..6aee3d1b 100644 --- a/osism/commands/server.py +++ b/osism/commands/server.py @@ -6,15 +6,22 @@ from cliff.command import Command import dateutil from loguru import logger -from tabulate import tabulate +import openstack from prompt_toolkit import prompt +from tabulate import tabulate -from osism.commands import get_cloud_connection +from osism.tasks.openstack import cleanup_cloud_environment, setup_cloud_environment class ServerMigrate(Command): def get_parser(self, prog_name): parser = super(ServerMigrate, self).get_parser(prog_name) + parser.add_argument( + "--cloud", + type=str, + help="Cloud name in clouds.yaml", + default="admin", + ) parser.add_argument( "--yes", default=False, @@ -48,46 +55,57 @@ def get_parser(self, prog_name): return parser def take_action(self, parsed_args): + cloud = parsed_args.cloud yes = parsed_args.yes instance = parsed_args.instance[0] target = parsed_args.target force = parsed_args.force no_wait = parsed_args.no_wait - conn = get_cloud_connection() - - result = conn.compute.get_server(instance) - server = [result.id, result.name, result.status] - - if server[2] not in ["ACTIVE", "PAUSED"]: - logger.info( - f"{server[0]} ({server[1]}) in status {server[2]} cannot be live migrated" - ) - return - - if yes: - answer = "yes" - else: - answer = prompt(f"Live migrate server {server[0]} ({server[1]}) [yes/no]: ") - - if answer in ["yes", "y"]: - logger.info(f"Live migrating server {server[0]}") - conn.compute.live_migrate_server( - server[0], host=target, block_migration="auto", force=force - ) - - if not no_wait: - inner_wait = True - while inner_wait: - time.sleep(2) - s = conn.compute.get_server(server[0]) - if s.status in ["MIGRATING"]: - logger.info( - f"Live migration of {server[0]} ({server[1]}) is still in progress" - ) - inner_wait = True - else: - inner_wait = False + temp_files, original_cwd, success = setup_cloud_environment(cloud) + if not success: + logger.error(f"Failed to setup cloud environment for '{cloud}'") + return 1 + + try: + conn = openstack.connect(cloud=cloud) + + result = conn.compute.get_server(instance) + server = [result.id, result.name, result.status] + + if server[2] not in ["ACTIVE", "PAUSED"]: + logger.info( + f"{server[0]} ({server[1]}) in status {server[2]} cannot be live migrated" + ) + return + + if yes: + answer = "yes" + else: + answer = prompt( + f"Live migrate server {server[0]} ({server[1]}) [yes/no]: " + ) + + if answer in ["yes", "y"]: + logger.info(f"Live migrating server {server[0]}") + conn.compute.live_migrate_server( + server[0], host=target, block_migration="auto", force=force + ) + + if not no_wait: + inner_wait = True + while inner_wait: + time.sleep(2) + s = conn.compute.get_server(server[0]) + if s.status in ["MIGRATING"]: + logger.info( + f"Live migration of {server[0]} ({server[1]}) is still in progress" + ) + inner_wait = True + else: + inner_wait = False + finally: + cleanup_cloud_environment(temp_files, original_cwd) class ServerList(Command): @@ -130,168 +148,109 @@ def get_parser(self, prog_name): def take_action(self, parsed_args): cloud = parsed_args.cloud - conn = get_cloud_connection(cloud) domain = parsed_args.domain project = parsed_args.project project_domain = parsed_args.project_domain user = parsed_args.user user_domain = parsed_args.user_domain - result = [] + temp_files, original_cwd, success = setup_cloud_environment(cloud) + if not success: + logger.error(f"Failed to setup cloud environment for '{cloud}'") + return 1 - # Handle user lookup if --user is specified - user_id = None - if user: - user_query = {} + try: + conn = openstack.connect(cloud=cloud) - if user_domain: - u_d = conn.identity.find_domain(user_domain, ignore_missing=True) - if u_d and "id" in u_d: - user_query = dict(domain_id=u_d.id) - else: - logger.error(f"No domain found for {user_domain}") - return + result = [] - u = conn.identity.find_user(user, ignore_missing=True, **user_query) - if u and "id" in u: - user_id = u.id - else: - logger.error(f"No user found for {user}") - return + # Handle user lookup if --user is specified + user_id = None + if user: + user_query = {} - if domain: - _domain = conn.identity.find_domain(domain) - if not _domain: - logger.error(f"Domain {domain} not found") - return - projects = list(conn.identity.projects(domain_id=_domain.id)) - - for project in projects: - query = {"project_id": project.id} - for server in conn.compute.servers(all_projects=True, **query): - result.append( - [ - project.name, - project.id, - server.user_id if hasattr(server, "user_id") else None, - server.id, - server.name, - server.flavor["original_name"], - server.status, - ] - ) + if user_domain: + u_d = conn.identity.find_domain(user_domain, ignore_missing=True) + if u_d and "id" in u_d: + user_query = dict(domain_id=u_d.id) + else: + logger.error(f"No domain found for {user_domain}") + return - print( - tabulate( - result, - headers=[ - "Project", - "Project ID", - "User ID", - "ID", - "Name", - "Flavor", - "Status", - ], - tablefmt="psql", - ) - ) + u = conn.identity.find_user(user, ignore_missing=True, **user_query) + if u and "id" in u: + user_id = u.id + else: + logger.error(f"No user found for {user}") + return - elif project: - if project_domain: - _project_domain = conn.identity.find_domain(project_domain) - if not _project_domain: - logger.error(f"Project domain {project_domain} not found") + if domain: + _domain = conn.identity.find_domain(domain) + if not _domain: + logger.error(f"Domain {domain} not found") return - query = {"domain_id": _project_domain.id} - _project = conn.identity.find_project(project, **query) - else: - _project = conn.identity.find_project(project) - if not _project: - logger.error(f"Project {project} not found") - return - query = {"project_id": _project.id} - - # Get domain name from project - domain_name = None - if hasattr(_project, "domain_id") and _project.domain_id: - try: - _domain = conn.identity.get_domain(_project.domain_id) - domain_name = _domain.name if _domain else _project.domain_id - except Exception: - domain_name = _project.domain_id - - for server in conn.compute.servers(all_projects=True, **query): - result.append( - [ - domain_name, - server.user_id if hasattr(server, "user_id") else None, - server.id, - server.name, - server.flavor["original_name"], - server.status, - ] - ) + projects = list(conn.identity.projects(domain_id=_domain.id)) + + for project in projects: + query = {"project_id": project.id} + for server in conn.compute.servers(all_projects=True, **query): + result.append( + [ + project.name, + project.id, + server.user_id if hasattr(server, "user_id") else None, + server.id, + server.name, + server.flavor["original_name"], + server.status, + ] + ) - print( - tabulate( - result, - headers=["Domain", "User ID", "ID", "Name", "Flavor", "Status"], - tablefmt="psql", + print( + tabulate( + result, + headers=[ + "Project", + "Project ID", + "User ID", + "ID", + "Name", + "Flavor", + "Status", + ], + tablefmt="psql", + ) ) - ) - elif user_id: - query = {"user_id": user_id} + elif project: + if project_domain: + _project_domain = conn.identity.find_domain(project_domain) + if not _project_domain: + logger.error(f"Project domain {project_domain} not found") + return + query = {"domain_id": _project_domain.id} + _project = conn.identity.find_project(project, **query) + else: + _project = conn.identity.find_project(project) + if not _project: + logger.error(f"Project {project} not found") + return + query = {"project_id": _project.id} - for server in conn.compute.servers(all_projects=True, **query): # Get domain name from project domain_name = None - if hasattr(server, "project_id") and server.project_id: + if hasattr(_project, "domain_id") and _project.domain_id: try: - _project = conn.identity.get_project(server.project_id) - if ( - _project - and hasattr(_project, "domain_id") - and _project.domain_id - ): - _domain = conn.identity.get_domain(_project.domain_id) - domain_name = ( - _domain.name if _domain else _project.domain_id - ) + _domain = conn.identity.get_domain(_project.domain_id) + domain_name = _domain.name if _domain else _project.domain_id except Exception: - domain_name = None - - result.append( - [ - domain_name, - server.project_id if hasattr(server, "project_id") else None, - server.id, - server.name, - server.flavor["original_name"], - server.status, - ] - ) - - print( - tabulate( - result, - headers=["Domain", "Project ID", "ID", "Name", "Flavor", "Status"], - tablefmt="psql", - ) - ) + domain_name = _project.domain_id - else: - for server in conn.compute.servers(all_projects=True, status="build"): - duration = datetime.now(timezone.utc) - dateutil.parser.parse( - server.created_at - ) - if duration.total_seconds() > 7200: - logger.info( - f"Server {server.id} hangs in BUILD status for more than 2 hours" - ) + for server in conn.compute.servers(all_projects=True, **query): result.append( [ + domain_name, + server.user_id if hasattr(server, "user_id") else None, server.id, server.name, server.flavor["original_name"], @@ -299,16 +258,50 @@ def take_action(self, parsed_args): ] ) - for server in conn.compute.servers(all_projects=True, status="error"): - duration = datetime.now(timezone.utc) - dateutil.parser.parse( - server.created_at - ) - if duration.total_seconds() > 7200: - logger.info( - f"Server {server.id} hangs in ERRORstatus for more than 2 hours" + print( + tabulate( + result, + headers=[ + "Domain", + "User ID", + "ID", + "Name", + "Flavor", + "Status", + ], + tablefmt="psql", ) + ) + + elif user_id: + query = {"user_id": user_id} + + for server in conn.compute.servers(all_projects=True, **query): + # Get domain name from project + domain_name = None + if hasattr(server, "project_id") and server.project_id: + try: + _project = conn.identity.get_project(server.project_id) + if ( + _project + and hasattr(_project, "domain_id") + and _project.domain_id + ): + _domain = conn.identity.get_domain(_project.domain_id) + domain_name = ( + _domain.name if _domain else _project.domain_id + ) + except Exception: + domain_name = None + result.append( [ + domain_name, + ( + server.project_id + if hasattr(server, "project_id") + else None + ), server.id, server.name, server.flavor["original_name"], @@ -316,13 +309,65 @@ def take_action(self, parsed_args): ] ) - print( - tabulate( - result, - headers=["ID", "Name", "Flavor", "Status"], - tablefmt="psql", + print( + tabulate( + result, + headers=[ + "Domain", + "Project ID", + "ID", + "Name", + "Flavor", + "Status", + ], + tablefmt="psql", + ) ) - ) + + else: + for server in conn.compute.servers(all_projects=True, status="build"): + duration = datetime.now(timezone.utc) - dateutil.parser.parse( + server.created_at + ) + if duration.total_seconds() > 7200: + logger.info( + f"Server {server.id} hangs in BUILD status for more than 2 hours" + ) + result.append( + [ + server.id, + server.name, + server.flavor["original_name"], + server.status, + ] + ) + + for server in conn.compute.servers(all_projects=True, status="error"): + duration = datetime.now(timezone.utc) - dateutil.parser.parse( + server.created_at + ) + if duration.total_seconds() > 7200: + logger.info( + f"Server {server.id} hangs in ERRORstatus for more than 2 hours" + ) + result.append( + [ + server.id, + server.name, + server.flavor["original_name"], + server.status, + ] + ) + + print( + tabulate( + result, + headers=["ID", "Name", "Flavor", "Status"], + tablefmt="psql", + ) + ) + finally: + cleanup_cloud_environment(temp_files, original_cwd) class ServerClean(Command): @@ -353,18 +398,37 @@ def take_action(self, parsed_args): yes = parsed_args.yes build_timeout = parsed_args.build_timeout - conn = get_cloud_connection(cloud) + temp_files, original_cwd, success = setup_cloud_environment(cloud) + if not success: + logger.error(f"Failed to setup cloud environment for '{cloud}'") + return 1 - # Handle servers stuck in BUILD status - for server in conn.compute.servers(all_projects=True, status="build"): - duration = datetime.now(timezone.utc) - dateutil.parser.parse( - server.created_at - ) - if duration.total_seconds() > build_timeout: - logger.info( - f"Server {server.id} ({server.name}) stuck in BUILD status " - f"for more than {build_timeout // 3600} hours" + try: + conn = openstack.connect(cloud=cloud) + + # Handle servers stuck in BUILD status + for server in conn.compute.servers(all_projects=True, status="build"): + duration = datetime.now(timezone.utc) - dateutil.parser.parse( + server.created_at ) + if duration.total_seconds() > build_timeout: + logger.info( + f"Server {server.id} ({server.name}) stuck in BUILD status " + f"for more than {build_timeout // 3600} hours" + ) + + if yes: + answer = "yes" + else: + answer = prompt(f"Delete server {server.id} [yes/no]: ") + + if answer in ["yes", "y"]: + logger.info(f"Deleting server {server.id}") + conn.compute.delete_server(server.id, force=True) + + # Handle servers in ERROR status + for server in conn.compute.servers(all_projects=True, status="error"): + logger.info(f"Server {server.id} ({server.name}) is in ERROR status") if yes: answer = "yes" @@ -374,16 +438,5 @@ def take_action(self, parsed_args): if answer in ["yes", "y"]: logger.info(f"Deleting server {server.id}") conn.compute.delete_server(server.id, force=True) - - # Handle servers in ERROR status - for server in conn.compute.servers(all_projects=True, status="error"): - logger.info(f"Server {server.id} ({server.name}) is in ERROR status") - - if yes: - answer = "yes" - else: - answer = prompt(f"Delete server {server.id} [yes/no]: ") - - if answer in ["yes", "y"]: - logger.info(f"Deleting server {server.id}") - conn.compute.delete_server(server.id, force=True) + finally: + cleanup_cloud_environment(temp_files, original_cwd) diff --git a/osism/commands/volume.py b/osism/commands/volume.py index 0141b927..730b33df 100644 --- a/osism/commands/volume.py +++ b/osism/commands/volume.py @@ -6,11 +6,12 @@ from cliff.command import Command import dateutil from loguru import logger +import openstack from prompt_toolkit import prompt import pytz from tabulate import tabulate -from osism.commands import get_cloud_connection +from osism.tasks.openstack import cleanup_cloud_environment, setup_cloud_environment # Time threshold for stuck volumes (2 hours in seconds) STUCK_VOLUME_THRESHOLD_SECONDS = 7200 @@ -47,26 +48,75 @@ def get_parser(self, prog_name): def take_action(self, parsed_args): cloud = parsed_args.cloud - conn = get_cloud_connection(cloud) domain = parsed_args.domain project = parsed_args.project project_domain = parsed_args.project_domain - result = [] - if domain: - _domain = conn.identity.find_domain(domain) - if not _domain: - logger.error(f"Domain {domain} not found") - return - projects = list(conn.identity.projects(domain_id=_domain.id)) + temp_files, original_cwd, success = setup_cloud_environment(cloud) + if not success: + logger.error(f"Failed to setup cloud environment for '{cloud}'") + return 1 + + try: + conn = openstack.connect(cloud=cloud) + + result = [] + if domain: + _domain = conn.identity.find_domain(domain) + if not _domain: + logger.error(f"Domain {domain} not found") + return + projects = list(conn.identity.projects(domain_id=_domain.id)) + + for project in projects: + query = {"project_id": project.id} + for volume in conn.block_storage.volumes( + all_projects=True, **query + ): + result.append( + [ + project.name, + project.id, + volume.id, + volume.name, + volume.volume_type, + volume.status, + ] + ) + + print( + tabulate( + result, + headers=[ + "Project", + "Project ID", + "ID", + "Name", + "Type", + "Status", + ], + tablefmt="psql", + ) + ) + + elif project: + if project_domain: + _project_domain = conn.identity.find_domain(project_domain) + if not _project_domain: + logger.error(f"Project domain {project_domain} not found") + return + query = {"domain_id": _project_domain.id} + _project = conn.identity.find_project(project, **query) + else: + _project = conn.identity.find_project(project) + if not _project: + logger.error(f"Project {project} not found") + return + query = {"project_id": _project.id} - for project in projects: - query = {"project_id": project.id} for volume in conn.block_storage.volumes(all_projects=True, **query): result.append( [ - project.name, - project.id, volume.id, volume.name, volume.volume_type, @@ -74,118 +124,99 @@ def take_action(self, parsed_args): ] ) - print( - tabulate( - result, - headers=["Project", "Project ID", "ID", "Name", "Type", "Status"], - tablefmt="psql", + print( + tabulate( + result, + headers=["ID", "Name", "Type", "Status"], + tablefmt="psql", + ) ) - ) - elif project: - if project_domain: - _project_domain = conn.identity.find_domain(project_domain) - if not _project_domain: - logger.error(f"Project domain {project_domain} not found") - return - query = {"domain_id": _project_domain.id} - _project = conn.identity.find_project(project, **query) else: - _project = conn.identity.find_project(project) - if not _project: - logger.error(f"Project {project} not found") - return - query = {"project_id": _project.id} - - for volume in conn.block_storage.volumes(all_projects=True, **query): - result.append( - [ - volume.id, - volume.name, - volume.volume_type, - volume.status, - ] - ) - - print( - tabulate( - result, - headers=["ID", "Name", "Type", "Status"], - tablefmt="psql", - ) - ) - - else: - for volume in conn.block_storage.volumes( - all_projects=True, status="detaching" - ): - created_at = pytz.utc.localize(dateutil.parser.parse(volume.created_at)) - duration = datetime.now(timezone.utc) - created_at - if duration.total_seconds() > 7200: - logger.info( - f"Volume {volume.id} hangs in DETACHING status for more than 2 hours" - ) - result.append( - [volume.id, volume.name, volume.volume_type, volume.status] + for volume in conn.block_storage.volumes( + all_projects=True, status="detaching" + ): + created_at = pytz.utc.localize( + dateutil.parser.parse(volume.created_at) ) + duration = datetime.now(timezone.utc) - created_at + if duration.total_seconds() > 7200: + logger.info( + f"Volume {volume.id} hangs in DETACHING status for more than 2 hours" + ) + result.append( + [volume.id, volume.name, volume.volume_type, volume.status] + ) - for volume in conn.block_storage.volumes( - all_projects=True, status="creating" - ): - created_at = pytz.utc.localize(dateutil.parser.parse(volume.created_at)) - duration = datetime.now(timezone.utc) - created_at - if duration.total_seconds() > 7200: - logger.info( - f"Volume {volume.id} hangs in CREATING status for more than 2 hours" - ) - result.append( - [volume.id, volume.name, volume.volume_type, volume.status] + for volume in conn.block_storage.volumes( + all_projects=True, status="creating" + ): + created_at = pytz.utc.localize( + dateutil.parser.parse(volume.created_at) ) + duration = datetime.now(timezone.utc) - created_at + if duration.total_seconds() > 7200: + logger.info( + f"Volume {volume.id} hangs in CREATING status for more than 2 hours" + ) + result.append( + [volume.id, volume.name, volume.volume_type, volume.status] + ) - for volume in conn.block_storage.volumes( - all_projects=True, status="error_deleting" - ): - created_at = pytz.utc.localize(dateutil.parser.parse(volume.created_at)) - duration = datetime.now(timezone.utc) - created_at - if duration.total_seconds() > 7200: - logger.info( - f"Volume {volume.id} hangs in ERROR_DELETING status for more than 2 hours" - ) - result.append( - [volume.id, volume.name, volume.volume_type, volume.status] + for volume in conn.block_storage.volumes( + all_projects=True, status="error_deleting" + ): + created_at = pytz.utc.localize( + dateutil.parser.parse(volume.created_at) ) + duration = datetime.now(timezone.utc) - created_at + if duration.total_seconds() > 7200: + logger.info( + f"Volume {volume.id} hangs in ERROR_DELETING status for more than 2 hours" + ) + result.append( + [volume.id, volume.name, volume.volume_type, volume.status] + ) - for volume in conn.block_storage.volumes( - all_projects=True, status="deleting" - ): - created_at = pytz.utc.localize(dateutil.parser.parse(volume.created_at)) - duration = datetime.now(timezone.utc) - created_at - if duration.total_seconds() > 7200: - logger.info( - f"Volume {volume.id} hangs in DELETING status for more than 2 hours" - ) - result.append( - [volume.id, volume.name, volume.volume_type, volume.status] + for volume in conn.block_storage.volumes( + all_projects=True, status="deleting" + ): + created_at = pytz.utc.localize( + dateutil.parser.parse(volume.created_at) ) + duration = datetime.now(timezone.utc) - created_at + if duration.total_seconds() > 7200: + logger.info( + f"Volume {volume.id} hangs in DELETING status for more than 2 hours" + ) + result.append( + [volume.id, volume.name, volume.volume_type, volume.status] + ) - for volume in conn.block_storage.volumes(all_projects=True, status="error"): - created_at = pytz.utc.localize(dateutil.parser.parse(volume.created_at)) - duration = datetime.now(timezone.utc) - created_at - if duration.total_seconds() > 7200: - logger.info( - f"Volume {volume.id} hangs in ERROR status for more than 2 hours" - ) - result.append( - [volume.id, volume.name, volume.volume_type, volume.status] + for volume in conn.block_storage.volumes( + all_projects=True, status="error" + ): + created_at = pytz.utc.localize( + dateutil.parser.parse(volume.created_at) ) + duration = datetime.now(timezone.utc) - created_at + if duration.total_seconds() > 7200: + logger.info( + f"Volume {volume.id} hangs in ERROR status for more than 2 hours" + ) + result.append( + [volume.id, volume.name, volume.volume_type, volume.status] + ) - print( - tabulate( - result, - headers=["ID", "Name", "Type", "Status"], - tablefmt="psql", + print( + tabulate( + result, + headers=["ID", "Name", "Type", "Status"], + tablefmt="psql", + ) ) - ) + finally: + cleanup_cloud_environment(temp_files, original_cwd) class VolumeRepair(Command): @@ -218,67 +249,85 @@ def take_action(self, parsed_args): cloud = parsed_args.cloud auto_confirm = parsed_args.yes - conn = get_cloud_connection(cloud) + temp_files, original_cwd, success = setup_cloud_environment(cloud) + if not success: + logger.error(f"Failed to setup cloud environment for '{cloud}'") + return 1 - # Handle volumes stuck in DETACHING state - for volume in conn.block_storage.volumes(all_projects=True, status="detaching"): - created_at = pytz.utc.localize(dateutil.parser.parse(volume.created_at)) - duration = datetime.now(timezone.utc) - created_at - if duration.total_seconds() > STUCK_VOLUME_THRESHOLD_SECONDS: - logger.info( - f"Volume {volume.id} hangs in DETACHING status for more than 2 hours" - ) - logger.info(f"Aborting detach of attachment(s) of volume {volume.id}") - conn.block_storage.abort_volume_detaching(volume.id) - - # Handle volumes stuck in CREATING state - for volume in conn.block_storage.volumes(all_projects=True, status="creating"): - created_at = pytz.utc.localize(dateutil.parser.parse(volume.created_at)) - duration = datetime.now(timezone.utc) - created_at - if duration.total_seconds() > STUCK_VOLUME_THRESHOLD_SECONDS: - logger.info( - f"Volume {volume.id} hangs in CREATING status for more than 2 hours" - ) + try: + conn = openstack.connect(cloud=cloud) + + # Handle volumes stuck in DETACHING state + for volume in conn.block_storage.volumes( + all_projects=True, status="detaching" + ): + created_at = pytz.utc.localize(dateutil.parser.parse(volume.created_at)) + duration = datetime.now(timezone.utc) - created_at + if duration.total_seconds() > STUCK_VOLUME_THRESHOLD_SECONDS: + logger.info( + f"Volume {volume.id} hangs in DETACHING status for more than 2 hours" + ) + logger.info( + f"Aborting detach of attachment(s) of volume {volume.id}" + ) + conn.block_storage.abort_volume_detaching(volume.id) + + # Handle volumes stuck in CREATING state + for volume in conn.block_storage.volumes( + all_projects=True, status="creating" + ): + created_at = pytz.utc.localize(dateutil.parser.parse(volume.created_at)) + duration = datetime.now(timezone.utc) - created_at + if duration.total_seconds() > STUCK_VOLUME_THRESHOLD_SECONDS: + logger.info( + f"Volume {volume.id} hangs in CREATING status for more than 2 hours" + ) + if auto_confirm: + result = "yes" + else: + result = prompt(f"Delete volume {volume.id} [yes/no]: ") + if result == "yes": + logger.info(f"Deleting volume {volume.id}") + conn.block_storage.delete_volume(volume.id, force=True) + + # Handle volumes in ERROR_DELETING state + for volume in conn.block_storage.volumes( + all_projects=True, status="error_deleting" + ): + logger.info(f"Volume {volume.id} is in ERROR_DELETING status") if auto_confirm: result = "yes" else: - result = prompt(f"Delete volume {volume.id} [yes/no]: ") + result = prompt(f"Retry to delete volume {volume.id} [yes/no]: ") if result == "yes": logger.info(f"Deleting volume {volume.id}") conn.block_storage.delete_volume(volume.id, force=True) - # Handle volumes in ERROR_DELETING state - for volume in conn.block_storage.volumes( - all_projects=True, status="error_deleting" - ): - logger.info(f"Volume {volume.id} is in ERROR_DELETING status") - if auto_confirm: - result = "yes" - else: - result = prompt(f"Retry to delete volume {volume.id} [yes/no]: ") - if result == "yes": - logger.info(f"Deleting volume {volume.id}") - conn.block_storage.delete_volume(volume.id, force=True) - - # Handle volumes stuck in DELETING state - for volume in conn.block_storage.volumes(all_projects=True, status="deleting"): - created_at = pytz.utc.localize(dateutil.parser.parse(volume.created_at)) - duration = datetime.now(timezone.utc) - created_at - if duration.total_seconds() > STUCK_VOLUME_THRESHOLD_SECONDS: - logger.info( - f"Volume {volume.id} hangs in DELETING status for more than 2 hours" - ) - if auto_confirm: - result = "yes" - else: - result = prompt(f"Retry deletion of volume {volume.id} [yes/no]: ") - if result == "yes": - logger.info(f"Resetting and deleting volume {volume.id}") - conn.block_storage.reset_volume_status( - volume.id, - status="available", - attach_status=None, - migration_status=None, + # Handle volumes stuck in DELETING state + for volume in conn.block_storage.volumes( + all_projects=True, status="deleting" + ): + created_at = pytz.utc.localize(dateutil.parser.parse(volume.created_at)) + duration = datetime.now(timezone.utc) - created_at + if duration.total_seconds() > STUCK_VOLUME_THRESHOLD_SECONDS: + logger.info( + f"Volume {volume.id} hangs in DELETING status for more than 2 hours" ) - sleep(SLEEP_WAIT_FOR_API) - conn.block_storage.delete_volume(volume.id, force=True) + if auto_confirm: + result = "yes" + else: + result = prompt( + f"Retry deletion of volume {volume.id} [yes/no]: " + ) + if result == "yes": + logger.info(f"Resetting and deleting volume {volume.id}") + conn.block_storage.reset_volume_status( + volume.id, + status="available", + attach_status=None, + migration_status=None, + ) + sleep(SLEEP_WAIT_FOR_API) + conn.block_storage.delete_volume(volume.id, force=True) + finally: + cleanup_cloud_environment(temp_files, original_cwd)