diff --git a/.pylintrc b/.pylintrc index b229646e..cd09e2eb 100644 --- a/.pylintrc +++ b/.pylintrc @@ -50,7 +50,7 @@ confidence= # --enable=similarities". If you want to run only the classes checker, but have # no Warning level messages displayed, use"--disable=all --enable=classes # --disable=W" -disable=print-statement,parameter-unpacking,unpacking-in-except,old-raise-syntax,backtick,long-suffix,old-ne-operator,old-octal-literal,import-star-module-level,raw-checker-failed,bad-inline-option,locally-disabled,locally-enabled,file-ignored,suppressed-message,useless-suppression,deprecated-pragma,apply-builtin,basestring-builtin,buffer-builtin,cmp-builtin,coerce-builtin,execfile-builtin,file-builtin,long-builtin,raw_input-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,no-absolute-import,old-division,dict-iter-method,dict-view-method,next-method-called,metaclass-assignment,indexing-exception,raising-string,reload-builtin,oct-method,hex-method,nonzero-method,cmp-method,input-builtin,round-builtin,intern-builtin,unichr-builtin,map-builtin-not-iterating,zip-builtin-not-iterating,range-builtin-not-iterating,filter-builtin-not-iterating,using-cmp-argument,eq-without-hash,div-method,idiv-method,rdiv-method,exception-message-attribute,invalid-str-codec,sys-max-int,bad-python3-import,deprecated-string-function,deprecated-str-translate-call,too-many-arguments 
+disable=print-statement,parameter-unpacking,unpacking-in-except,old-raise-syntax,backtick,long-suffix,old-ne-operator,old-octal-literal,import-star-module-level,raw-checker-failed,bad-inline-option,locally-disabled,locally-enabled,file-ignored,suppressed-message,useless-suppression,deprecated-pragma,apply-builtin,basestring-builtin,buffer-builtin,cmp-builtin,coerce-builtin,execfile-builtin,file-builtin,long-builtin,raw_input-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,no-absolute-import,old-division,dict-iter-method,dict-view-method,next-method-called,metaclass-assignment,indexing-exception,raising-string,reload-builtin,oct-method,hex-method,nonzero-method,cmp-method,input-builtin,round-builtin,intern-builtin,unichr-builtin,map-builtin-not-iterating,zip-builtin-not-iterating,range-builtin-not-iterating,filter-builtin-not-iterating,using-cmp-argument,eq-without-hash,div-method,idiv-method,rdiv-method,exception-message-attribute,invalid-str-codec,sys-max-int,bad-python3-import,deprecated-string-function,deprecated-str-translate-call,too-many-arguments,duplicate-code # Enable the message, report, category or checker with the given id(s). 
You can # either give multiple identifier separated by comma (,) or put this option diff --git a/.vscode/cSpell.json b/.vscode/cSpell.json index bdf759f1..6802cd5f 100644 --- a/.vscode/cSpell.json +++ b/.vscode/cSpell.json @@ -9,6 +9,7 @@ "JMES", "Stateful", "Unmonitored", + "creds", "isdir", "isfile", "opencode", diff --git a/src/README.rst b/src/README.rst index d53fc626..bff8420a 100644 --- a/src/README.rst +++ b/src/README.rst @@ -16,6 +16,26 @@ To get started, after installation run the following: Change Log ========== +2.0.0 +----- + +- Update to official 6.0 Service Fabric SDK +- Report cluster health command added +- Report health commands now have an immediate argument to tell the Fabric + gateway to send the report immediately +- Get cluster configuration and upgrade configuration for stand alone clusters + commands added +- Added start and update cluster upgrade commands +- Start node command removed (use enable node) +- Stop node command removed (use disable node) +- Added information about new Fabric name hierarchical delimiter (~) +- Health commands now include statistics, can be optionally removed +- Limited set of repair manager commands added +- Infrastructure service commands no longer accept a callback function +- Docker compose commands have had arguments renamed to reflect Service Fabric + API changes +- Added support to upgrade Docker compose deployments + 1.2.0rc2 -------- diff --git a/src/setup.py b/src/setup.py index fd36b88a..63fe3a47 100644 --- a/src/setup.py +++ b/src/setup.py @@ -17,7 +17,7 @@ def read(fname): setup( name='sfctl', - version='1.2.0rc2', + version='2.0.0', description='Azure Service Fabric command line', long_description=read('README.rst'), url='https://github.com/Azure/service-fabric-cli', @@ -46,7 +46,7 @@ def read(fname): 'knack==0.1.1', 'msrest>=0.4.4', 'requests', - 'sfctl-azure-servicefabric==6.0.0rc1', + 'azure-servicefabric==6.0', 'jsonpickle' ], extras_require={ diff --git a/src/sfctl/commands.py 
b/src/sfctl/commands.py index 6339fb3f..7a42095a 100644 --- a/src/sfctl/commands.py +++ b/src/sfctl/commands.py @@ -17,6 +17,9 @@ # Need to import so global help dict gets updated import sfctl.helps.app # pylint: disable=unused-import import sfctl.helps.main # pylint: disable=unused-import +import sfctl.helps.health # pylint: disable=unused-import +import sfctl.helps.cluster_upgrade # pylint: disable=unused-import +import sfctl.helps.compose # pylint: disable=unused-import class SFCommandHelp(CLIHelp): """Service Fabric CLI help loader""" @@ -36,6 +39,18 @@ def load_command_table(self, args): #pylint: disable=too-many-statements client_func_path = 'azure.servicefabric#ServiceFabricClientAPIs.{}' with CommandSuperGroup(__name__, self, client_func_path, client_factory=client_create) as super_group: + + with super_group.group('rpm') as group: + group.command('delete', 'delete_repair_task') + group.command('list', 'get_repair_task_list') + group.command('approve-force', 'force_approve_repair_task') + + + with super_group.group('sa-cluster') as group: + group.command('config', 'get_cluster_configuration') + group.command('upgrade-status', + 'get_cluster_configuration_upgrade_status') + with super_group.group('cluster') as group: group.command('health', 'get_cluster_health') group.command('manifest', 'get_cluster_manifest') @@ -51,6 +66,10 @@ def load_command_table(self, args): #pylint: disable=too-many-statements group.command('recover-system', 'recover_system_partitions') group.command('operation-list', 'get_fault_operation_list') group.command('operation-cancel', 'cancel_operation') + group.command('provision', 'provision_cluster') + group.command('unprovision', 'unprovision_cluster') + group.command('upgrade-rollback', 'rollback_cluster_upgrade') + group.command('upgrade-resume', 'resume_cluster_upgrade') with super_group.group('node') as group: group.command('list', 'get_node_info_list') @@ -60,8 +79,6 @@ def load_command_table(self, args): #pylint: 
disable=too-many-statements group.command('disable', 'disable_node') group.command('enable', 'enable_node') group.command('remove-state', 'remove_node_state') - group.command('start', 'start_node') - group.command('stop', 'stop_node') group.command('restart', 'restart_node') group.command('transition', 'start_node_transition') group.command( @@ -94,6 +111,7 @@ def load_command_table(self, args): #pylint: disable=too-many-statements 'get_deployed_application_health' ) group.command('manifest', 'get_application_manifest') + group.command('load', 'get_application_load_info') with super_group.group('service') as group: group.command('type-list', 'get_service_type_info_list') @@ -166,9 +184,11 @@ def load_command_table(self, args): #pylint: disable=too-many-statements group.command('remove', 'remove_replica') with super_group.group('compose') as group: - group.command('status', 'get_compose_application_status') - group.command('list', 'get_compose_application_status_list') - group.command('remove', 'remove_compose_application') + group.command('status', 'get_compose_deployment_status') + group.command('list', 'get_compose_deployment_status_list') + group.command('remove', 'remove_compose_deployment') + group.command('upgrade-status', + 'get_compose_deployment_upgrade_progress') with super_group.group('chaos') as group: group.command('stop', 'stop_chaos') @@ -185,10 +205,23 @@ def load_command_table(self, args): #pylint: disable=too-many-statements # Custom commands - with CommandSuperGroup(__name__, self, 'sfctl.custom_app#{}', + with CommandSuperGroup(__name__, self, + 'sfctl.custom_cluster_upgrade#{}', + client_factory=client_create) as super_group: + with super_group.group('cluster') as group: + group.command('upgrade', 'upgrade') + group.command('upgrade-update', 'update_upgrade') + with super_group.group('sa-cluster') as group: + group.command('config-upgrade', 'sa_configuration_upgrade') + + with CommandSuperGroup(__name__, self, 'sfctl.custom_compose#{}', 
client_factory=client_create) as super_group: with super_group.group('compose') as group: - group.command('create', 'create_compose_application') + group.command('upgrade', 'upgrade') + group.command('create', 'create') + + with CommandSuperGroup(__name__, self, 'sfctl.custom_app#{}', + client_factory=client_create) as super_group: with super_group.group('application') as group: group.command('create', 'create') group.command('upgrade', 'upgrade') @@ -220,6 +253,8 @@ def load_command_table(self, args): #pylint: disable=too-many-statements group.command('report-health', 'report_replica_health') with super_group.group('node') as group: group.command('report-health', 'report_node_health') + with super_group.group('cluster') as group: + group.command('report-health', 'report_cluster_health') with CommandSuperGroup(__name__, self, 'sfctl.custom_service#{}', client_factory=client_create) as super_group: diff --git a/src/sfctl/custom_app.py b/src/sfctl/custom_app.py index c6b72b23..4fa8d9b1 100644 --- a/src/sfctl/custom_app.py +++ b/src/sfctl/custom_app.py @@ -13,48 +13,6 @@ import shutil from knack.util import CLIError -def create_compose_application(client, compose_file, application_id, - repo_user=None, encrypted=False, - repo_pass=None, timeout=60): - """ - Creates a Service Fabric application from a Compose file - :param str application_id: The id of application to create from - Compose file. 
This is typically the full id of the application - including "fabric:" URI scheme - :param str compose_file: Path to the Compose file to use - :param str repo_user: Container repository user name if needed for - authentication - :param bool encrypted: If true, indicate to use an encrypted password - rather than prompting for a plaintext one - :param str repo_pass: Encrypted container repository password - """ - from azure.servicefabric.models.create_compose_application_description import CreateComposeApplicationDescription # pylint: disable=line-too-long - from azure.servicefabric.models.repository_credential import ( - RepositoryCredential - ) - from getpass import getpass - - if (any([encrypted, repo_pass]) and - not all([encrypted, repo_pass, repo_user])): - raise CLIError('Invalid credentials syntax') - - if repo_user and not repo_pass: - repo_pass = getpass('Repository password: ') - - repo_cred = RepositoryCredential(repo_user, repo_pass, encrypted) - - file_contents = None - with open(compose_file) as f_desc: - file_contents = f_desc.read() - if not file_contents: - raise CLIError('Could not read {}'.format(compose_file)) - - model = CreateComposeApplicationDescription(application_id, file_contents, - repo_cred) - - client.create_compose_application(model, timeout) - - def validate_app_path(app_path): """Validate and return application package as absolute path""" @@ -292,7 +250,7 @@ def create(client, # pylint: disable=too-many-locals,too-many-arguments client.create_application(app_desc, timeout) -def upgrade( # pylint: disable=too-many-arguments,too-many-locals +def upgrade( # pylint: disable=too-many-arguments,too-many-locals,missing-docstring client, app_id, app_version, parameters, mode="UnmonitoredAuto", replica_set_check_timeout=None, force_restart=None, failure_action=None, health_check_wait_duration="0", @@ -303,51 +261,6 @@ def upgrade( # pylint: disable=too-many-arguments,too-many-locals warning_as_error=False, max_unhealthy_apps=0, 
default_service_health_policy=None, service_health_policy=None, timeout=60): - """ - Starts upgrading an application in the Service Fabric cluster. - Validates the supplied application upgrade parameters and starts upgrading - the application if the parameters are valid. Please note that upgrade - description replaces the existing application description. This means that - if the parameters are not specified, the existing parameters on the - applications will be overwritten with the empty parameters list. This - would results in application using the default value of the parameters - from the application manifest. - :param str app_id: The identity of the application. This is typically the - full name of the application without the 'fabric:' URI scheme. - :param str app_version: The target application type version (found in the - application manifest) for the application upgrade. - :param str parameters: A JSON encoded list of application parameter - overrides to be applied when upgrading the application. - :param str mode: The mode used to monitor health during a rolling upgrade. - :param int replica_set_check_timeout: The maximum amount of time to block - processing of an upgrade domain and prevent loss of availability when - there are unexpected issues. Measured in seconds. - :param bool force_restart: Forcefully restart processes during upgrade even - when the code version has not changed. - :param str failure_action: The action to perform when a Monitored upgrade - encounters monitoring policy or health policy violations. - :param str health_check_wait_duration: The amount of time to wait after - completing an upgrade domain before applying health policies. Measured in - milliseconds. - :param str health_check_stable_duration: The amount of time that the - application or cluster must remain healthy before the upgrade proceeds - to the next upgrade domain. Measured in milliseconds. 
- :param str health_check_retry_timeout: The amount of time to retry health - evaluations when the application or cluster is unhealthy before the failure - action is executed. Measured in milliseconds. - :param str upgrade_timeout: The amount of time the overall upgrade has to - complete before FailureAction is executed. Measured in milliseconds. - :param str upgrade_domain_timeout: The amount of time each upgrade domain - has to complete before FailureAction is executed. Measured in milliseconds. - :param bool warning_as_error: Treat health evaluation warnings with the - same severity as errors. - :param int max_unhealthy_apps: The maximum allowed percentage of unhealthy - deployed applications. Represented as a number between 0 and 100. - :param str default_service_health_policy: JSON encoded specification of the - health policy used by default to evaluate the health of a service type. - :param str service_health_policy: JSON encoded map with service type health - policy per service type name. The map is empty be default. - """ from azure.servicefabric.models.application_upgrade_description import ( ApplicationUpgradeDescription ) diff --git a/src/sfctl/custom_cluster_upgrade.py b/src/sfctl/custom_cluster_upgrade.py new file mode 100644 index 00000000..33dcc1d1 --- /dev/null +++ b/src/sfctl/custom_cluster_upgrade.py @@ -0,0 +1,184 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# ----------------------------------------------------------------------------- + +"""Custom cluster upgrade specific commands""" + +from knack.util import CLIError + +def create_monitoring_policy(failure_action, health_check_wait, + health_check_stable, health_check_retry, + upgrade_timeout, upgrade_domain_timeout): + """Create a monitoring policy description for an upgrade""" + from azure.servicefabric.models import MonitoringPolicyDescription + + if failure_action not in ['Invalid', 'Rollback', 'Manual', None]: + raise CLIError('Invalid upgrade failure action specified') + + if not any([failure_action, health_check_wait, health_check_stable, + health_check_retry, upgrade_timeout, upgrade_domain_timeout]): + return None + return MonitoringPolicyDescription(failure_action, health_check_wait, + health_check_stable, health_check_retry, + upgrade_timeout, upgrade_domain_timeout) + +def create_upgrade_health_policy(delta_unhealthy_nodes, + ud_delta_unhealthy_nodes): + """Create an upgrade node health policy""" + from azure.servicefabric.models import ClusterUpgradeHealthPolicyObject + + if not any([delta_unhealthy_nodes, ud_delta_unhealthy_nodes]): + return None + return ClusterUpgradeHealthPolicyObject( + delta_unhealthy_nodes, ud_delta_unhealthy_nodes + ) + +def create_cluster_health_policy(warning_as_error, unhealthy_nodes, + unhealthy_applications, + application_type_health_map): + """Create a cluster health policy for an upgrade""" + from azure.servicefabric.models import (ClusterHealthPolicy, + ApplicationTypeHealthPolicyMapItem) + + app_type_list = None + if application_type_health_map: + app_type_list = [] + for app_type in application_type_health_map: + allowed_unhealthy = application_type_health_map[app_type] + policy_item = ApplicationTypeHealthPolicyMapItem(app_type, + allowed_unhealthy) + app_type_list.append(policy_item) + + if not any([warning_as_error, unhealthy_nodes, unhealthy_applications, + app_type_list]): + return None + return 
ClusterHealthPolicy(warning_as_error, unhealthy_nodes, + unhealthy_applications, app_type_list) + +def parse_app_health_policy(app_health_map): + """From a complex object create a map of application health policies""" + from azure.servicefabric.models import (ApplicationHealthPolicyMapItem, + ApplicationHealthPolicies) + if not app_health_map: + return None + policy_list = [] + for app in app_health_map: + allowed_unhealthy = app_health_map[app] + policy_item = ApplicationHealthPolicyMapItem(app, allowed_unhealthy) + policy_list.append(policy_item) + + return ApplicationHealthPolicies(policy_list) + +def create_rolling_update_desc( + rolling_upgrade_mode, force_restart, replica_set_check_timeout, + failure_action, health_check_wait, health_check_stable, + health_check_retry, upgrade_timeout, upgrade_domain_timeout): + """Create an update description for an upgrade rolling mode""" + from azure.servicefabric.models import RollingUpgradeUpdateDescription + + return RollingUpgradeUpdateDescription( + rolling_upgrade_mode=rolling_upgrade_mode, + force_restart=force_restart, + replica_set_check_timeout_in_milliseconds=replica_set_check_timeout, #pylint: disable=line-too-long + failure_action=failure_action, + health_check_wait_duration_in_milliseconds=health_check_wait, + health_check_stable_duration_in_milliseconds=health_check_stable, + health_check_retry_timeout_in_milliseconds=health_check_retry, + upgrade_domain_timeout_in_milliseconds=upgrade_domain_timeout, + upgrade_timeout_in_milliseconds=upgrade_timeout) + +def upgrade( #pylint: disable=too-many-locals,missing-docstring,invalid-name + client, code_version=None, config_version=None, + rolling_upgrade_mode='UnmonitoredAuto', replica_set_check_timeout=None, + force_restart=False, failure_action=None, health_check_wait=None, + health_check_stable=None, health_check_retry=None, + upgrade_timeout=None, upgrade_domain_timeout=None, + warning_as_error=False, unhealthy_nodes=0, unhealthy_applications=0, + 
app_type_health_map=None, delta_health_evaluation=False, + delta_unhealthy_nodes=10, upgrade_domain_delta_unhealthy_nodes=15, + app_health_map=None, timeout=60): + from azure.servicefabric.models import StartClusterUpgradeDescription + + mon_policy = create_monitoring_policy(failure_action, health_check_wait, + health_check_stable, + health_check_retry, upgrade_timeout, + upgrade_domain_timeout) + cluster_policy = create_cluster_health_policy( + warning_as_error, unhealthy_nodes, unhealthy_applications, + app_type_health_map + ) + cluster_upgrade_policy = create_upgrade_health_policy( + delta_unhealthy_nodes, upgrade_domain_delta_unhealthy_nodes) + app_health_policy = parse_app_health_policy(app_health_map) + + upgrade_desc = StartClusterUpgradeDescription( + code_version=code_version, config_version=config_version, + upgrade_kind='Rolling', rolling_upgrade_mode=rolling_upgrade_mode, + upgrade_replica_set_check_timeout_in_seconds=replica_set_check_timeout, + force_restart=force_restart, monitoring_policy=mon_policy, + cluster_health_policy=cluster_policy, + enable_delta_health_evaluation=delta_health_evaluation, + cluster_upgrade_health_policy=cluster_upgrade_policy, + application_health_policy_map=app_health_policy) + + client.start_cluster_upgrade(upgrade_desc, timeout=timeout) + +def sa_configuration_upgrade( #pylint: disable=missing-docstring,invalid-name + client, cluster_config, health_check_retry='PT0H0M0S', + health_check_wait='PT0H0M0S', health_check_stable='PT0H0M0S', + upgrade_domain_timeout='PT0H0M0S', upgrade_timeout='PT0H0M0S', + unhealthy_applications=0, unhealthy_nodes=0, delta_unhealthy_nodes=0, + upgrade_domain_delta_unhealthy_nodes=0, timeout=60): + from azure.servicefabric.models import \ + ClusterConfigurationUpgradeDescription + + upgrade_desc = ClusterConfigurationUpgradeDescription( + cluster_config, health_check_retry_timeout=health_check_retry, + health_check_wait_duration_in_seconds=health_check_wait, + 
health_check_stable_duration_in_seconds=health_check_stable, + upgrade_domain_timeout_in_seconds=upgrade_domain_timeout, + upgrade_timeout_in_seconds=upgrade_timeout, + max_percent_unhealthy_applications=unhealthy_applications, + max_percent_unhealthy_nodes=unhealthy_nodes, + max_percent_delta_unhealthy_nodes=delta_unhealthy_nodes, + max_percent_upgrade_domain_delta_unhealthy_nodes=upgrade_domain_delta_unhealthy_nodes) #pylint: disable=line-too-long + + client.start_cluster_configuration_upgrade(upgrade_desc, timeout=timeout) + +def update_upgrade( #pylint: disable=too-many-locals,missing-docstring,invalid-name + client, upgrade_kind='Rolling', rolling_upgrade_mode='UnmonitoredAuto', + replica_set_check_timeout=None, force_restart=False, + failure_action=None, health_check_wait=None, health_check_stable=None, + health_check_retry=None, upgrade_timeout=None, + upgrade_domain_timeout=None, warning_as_error=False, unhealthy_nodes=0, + unhealthy_applications=0, app_type_health_map=None, + delta_health_evaluation=False, delta_unhealthy_nodes=10, + upgrade_domain_delta_unhealthy_nodes=15, app_health_map=None, + timeout=60): + from azure.servicefabric.models import UpdateClusterUpgradeDescription + + rolling_desc = create_rolling_update_desc( + rolling_upgrade_mode, force_restart, replica_set_check_timeout, + failure_action, health_check_wait, health_check_stable, + health_check_retry, upgrade_timeout, upgrade_domain_timeout + ) + health_policy = create_cluster_health_policy( + warning_as_error, unhealthy_nodes, unhealthy_applications, + app_type_health_map + ) + upgrade_health_policy = create_upgrade_health_policy( + delta_unhealthy_nodes, upgrade_domain_delta_unhealthy_nodes + ) + app_policies = parse_app_health_policy(app_health_map) + + update_desc = UpdateClusterUpgradeDescription( + upgrade_kind=upgrade_kind, update_description=rolling_desc, + cluster_health_policy=health_policy, + enable_delta_health_evaluation=delta_health_evaluation, + 
cluster_upgrade_health_policy=upgrade_health_policy, + application_health_policy_map=app_policies + ) + + client.update_cluster_upgrade(update_desc, timeout=timeout) diff --git a/src/sfctl/custom_compose.py b/src/sfctl/custom_compose.py new file mode 100644 index 00000000..51835f8e --- /dev/null +++ b/src/sfctl/custom_compose.py @@ -0,0 +1,113 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# ----------------------------------------------------------------------------- + +"""Custom commands for the Service Fabric Docker compose support""" + +from knack.cli import CLIError + +def read_file(file_path): + """Reads a file contents given a file path""" + file_contents = None + with open(file_path) as f_desc: + file_contents = f_desc.read() + if not file_contents: + raise CLIError('Could not read {}'.format(file_path)) + return file_contents + +def repo_creds(username, encrypted_password, has_pass): + """Get a representation of the container repository credentials""" + from azure.servicefabric.models import RegistryCredential + from getpass import getpass + + # Wonky since we allow empty string as an encrypted passphrase + if not any([username, encrypted_password is not None, has_pass]): + return None + + if (encrypted_password is not None) and (not username): + raise CLIError('Missing container repository username') + + if has_pass and (not username): + raise CLIError('Missing container repository username') + + if encrypted_password is not None: + return RegistryCredential(registry_user_name=username, + registry_password=encrypted_password, + password_encrypted=True) + elif has_pass: + passphrase = getpass(prompt='Container repository password: ') + return RegistryCredential(registry_user_name=username, + registry_password=passphrase, + password_encrypted=False) + return 
RegistryCredential(registry_user_name=username) + +def create_app_health_policy( + warning_as_error, unhealthy_app, default_svc_health_map, + svc_type_health_map): + """Create an application health policy description""" + from sfctl.custom_health import (parse_service_health_policy, + parse_service_health_policy_map) + from azure.servicefabric.models import ApplicationHealthPolicy + + default_svc_type_policy = parse_service_health_policy( + default_svc_health_map + ) + svc_type_policy = parse_service_health_policy_map(svc_type_health_map) + + return ApplicationHealthPolicy( + consider_warning_as_error=warning_as_error, + max_percent_unhealthy_deployed_applications=unhealthy_app, + default_service_type_health_policy=default_svc_type_policy, + service_type_health_policy_map=svc_type_policy + ) + + +def create(client, name, file_path, user=None, has_pass=False, #pylint: disable=missing-docstring + encrypted_pass=None, timeout=60): + from azure.servicefabric.models import CreateComposeDeploymentDescription + + file_contents = read_file(file_path) + credentials = repo_creds(user, encrypted_pass, has_pass) + desc = CreateComposeDeploymentDescription(name, file_contents, + registry_credential=credentials) + client.create_compose_deployment(desc, timeout=timeout) + + +def upgrade(client, name, file_path, user=None, has_pass=False, #pylint: disable=missing-docstring,too-many-locals + encrypted_pass=None, upgrade_kind='Rolling', + upgrade_mode='UnmonitoredAuto', replica_set_check=None, + force_restart=False, failure_action=None, health_check_wait=None, + health_check_stable=None, health_check_retry=None, + upgrade_timeout=None, upgrade_domain_timeout=None, + warning_as_error=False, unhealthy_app=0, + default_svc_type_health_map=None, svc_type_health_map=None, + timeout=60): + from azure.servicefabric.models import ComposeDeploymentUpgradeDescription + from sfctl.custom_cluster_upgrade import create_monitoring_policy + + file_contents = read_file(file_path) + + credentials = 
repo_creds(user, encrypted_pass, has_pass) + + monitoring_policy = create_monitoring_policy(failure_action, + health_check_wait, + health_check_stable, + health_check_retry, + upgrade_timeout, + upgrade_domain_timeout) + + app_health_policy = create_app_health_policy(warning_as_error, + unhealthy_app, + default_svc_type_health_map, + svc_type_health_map) + + desc = ComposeDeploymentUpgradeDescription( + name, file_contents, registry_credential=credentials, + upgrade_kind=upgrade_kind, rolling_upgrade_mode=upgrade_mode, + upgrade_replica_set_check_timeout_in_seconds=replica_set_check, + force_restart=force_restart, monitoring_policy=monitoring_policy, + application_health_policy=app_health_policy) + + client.start_compose_deployment_upgrade(name, desc, timeout=timeout) diff --git a/src/sfctl/custom_health.py b/src/sfctl/custom_health.py index 84fc1296..db18cab2 100644 --- a/src/sfctl/custom_health.py +++ b/src/sfctl/custom_health.py @@ -68,339 +68,92 @@ def parse_app_health_map(formatted_map): health_map.append(map_item) return health_map -def report_app_health(client, application_id, +def create_health_information(source_id, health_property, health_state, ttl, + description, sequence_number, + remove_when_expired): + """Validates and creates a health information object""" + from azure.servicefabric.models import HealthInformation + + if health_state not in ['Invalid', 'Ok', 'Warning', 'Unknown']: + raise CLIError('Invalid health state specified') + + return HealthInformation(source_id, health_property, health_state, ttl, + description, sequence_number, remove_when_expired) + +def report_cluster_health(client, source_id, health_property, health_state, #pylint: disable=missing-docstring + ttl=None, description=None, sequence_number=None, + remove_when_expired=False, immediate=False, + timeout=60): + health_info = create_health_information(source_id, health_property, + health_state, ttl, description, + sequence_number, + remove_when_expired) + + 
client.report_cluster_health(health_info, immediate=immediate, + timeout=timeout) + + +def report_app_health(client, application_id, #pylint: disable=missing-docstring source_id, health_property, health_state, ttl=None, description=None, sequence_number=None, - remove_when_expired=None, timeout=60): - """ - Sends a health report on the Service Fabric application. - Reports health state of the specified Service Fabric application. The - report must contain the information about the source of the health report - and property on which it is reported. The report is sent to a Service - Fabric gateway Application, which forwards to the health store. The report - may be accepted by the gateway, but rejected by the health store after - extra validation. For example, the health store may reject the report - because of an invalid parameter, like a stale sequence number. To see - whether the report was applied in the health store, check that the report - appears in the events section. - :param str application_id: The identity of the application. This is - typically the full name of the application without the 'fabric:' URI - scheme. - :param str source_id: The source name which identifies the - client/watchdog/system component which generated the health information. - :param str health_property: The property of the health information. An - entity can have health reports for different properties. The property is a - string and not a fixed enumeration to allow the reporter flexibility to - categorize the state condition that triggers the report. For example, a - reporter with SourceId "LocalWatchdog" can monitor the state of the - available disk on a node, so it can report "AvailableDisk" property on - that node. The same reporter can monitor the node connectivity, so it can - report a property "Connectivity" on the same node. In the health store, - these reports are treated as separate health events for the specified node. 
- Together with the SourceId, the property uniquely identifies the health - information. - :param str health_state: Possible values include: 'Invalid', 'Ok', - 'Warning', 'Error', 'Unknown' - :param str ttl: The duration, in milliseconds, for which this health report - is valid. When clients report periodically, they should send reports with - higher frequency than time to live. If not specified, time to live defaults - to infinite value. - :param str description: The description of the health information. It - represents free text used to add human readable information about the - report. The maximum string length for the description is 4096 characters. - If the provided string is longer, it will be automatically truncated. - When truncated, the last characters of the description contain a marker - "[Truncated]", and total string size is 4096 characters. The presence of - the marker indicates to users that truncation occurred. Note that when - truncated, the description has less than 4096 characters from the original - string. - :param str sequence_number: The sequence number for this health report as a - numeric string. The report sequence number is used by the health store to - detect stale reports. If not specified, a sequence number is auto-generated - by the health client when a report is added. - :param bool remove_when_expired: Value that indicates whether the report is - removed from health store when it expires. If set to true, the report is - removed from the health store after it expires. If set to false, the report - is treated as an error when expired. The value of this property is false by - default. When clients report periodically, they should set this value to - false (default). This way, is the reporter has issues (eg. deadlock) and - can't report, the entity is evaluated at error when the health report - expires. This flags the entity as being in Error health state. 
- """ - - from azure.servicefabric.models.health_information import HealthInformation - - info = HealthInformation(source_id, health_property, health_state, ttl, - description, sequence_number, remove_when_expired) + remove_when_expired=None, immediate=False, timeout=60): - client.report_application_health(application_id, info, timeout) + health_info = create_health_information(source_id, health_property, + health_state, ttl, description, + sequence_number, + remove_when_expired) + client.report_application_health(application_id, health_info, + immediate=immediate, timeout=timeout) -def report_svc_health(client, service_id, source_id, health_property, + +def report_svc_health(client, service_id, source_id, health_property, #pylint: disable=missing-docstring health_state, ttl=None, description=None, sequence_number=None, remove_when_expired=None, - timeout=60): - """ - Sends a health report on the Service Fabric service. - Reports health state of the specified Service Fabric service. The - report must contain the information about the source of the health - report and property on which it is reported. The report is sent to a - Service Fabric gateway Service, which forwards to the health store. - The report may be accepted by the gateway, but rejected by the health - store after extra validation. For example, the health store may reject - the report because of an invalid parameter, like a stale sequence number. - To see whether the report was applied in the health store, check that the - report appears in the health events of the service. - :param str service_id: The identity of the service. This is typically the - full name of the service without the 'fabric:' URI scheme. - :param str source_id: The source name which identifies the - client/watchdog/system component which generated the health information. - :param str health_property: The property of the health information. An - entity can have health reports for different properties. 
The property is a - string and not a fixed enumeration to allow the reporter flexibility to - categorize the state condition that triggers the report. For example, a - reporter with SourceId "LocalWatchdog" can monitor the state of the - available disk on a node, so it can report "AvailableDisk" property on - that node. The same reporter can monitor the node connectivity, so it can - report a property "Connectivity" on the same node. In the health store, - these reports are treated as separate health events for the specified node. - Together with the SourceId, the property uniquely identifies the health - information. - :param str health_state: Possible values include: 'Invalid', 'Ok', - 'Warning', 'Error', 'Unknown' - :param str ttl: The duration, in milliseconds, for which this health report - is valid. When clients report periodically, they should send reports with - higher frequency than time to live. If not specified, time to live defaults - to infinite value. - :param str description: The description of the health information. It - represents free text used to add human readable information about the - report. The maximum string length for the description is 4096 characters. - If the provided string is longer, it will be automatically truncated. - When truncated, the last characters of the description contain a marker - "[Truncated]", and total string size is 4096 characters. The presence of - the marker indicates to users that truncation occurred. Note that when - truncated, the description has less than 4096 characters from the original - string. - :param str sequence_number: The sequence number for this health report as a - numeric string. The report sequence number is used by the health store to - detect stale reports. If not specified, a sequence number is auto-generated - by the health client when a report is added. - :param bool remove_when_expired: Value that indicates whether the report is - removed from health store when it expires. 
If set to true, the report is - removed from the health store after it expires. If set to false, the report - is treated as an error when expired. The value of this property is false by - default. When clients report periodically, they should set this value to - false (default). This way, is the reporter has issues (eg. deadlock) and - can't report, the entity is evaluated at error when the health report - expires. This flags the entity as being in Error health state. - """ - - from azure.servicefabric.models.health_information import HealthInformation - - info = HealthInformation(source_id, health_property, health_state, ttl, - description, sequence_number, remove_when_expired) + timeout=60, immediate=False): + health_info = create_health_information(source_id, health_property, + health_state, ttl, description, + sequence_number, + remove_when_expired) - client.report_service_health(service_id, info, timeout) + client.report_service_health(service_id, health_info, timeout=timeout, + immediate=immediate) -def report_partition_health(client, partition_id, source_id, health_property, +def report_partition_health(client, partition_id, source_id, health_property, #pylint: disable=missing-docstring health_state, ttl=None, description=None, sequence_number=None, remove_when_expired=None, - timeout=60): - """ - Sends a health report on the Service Fabric partition. - Reports health state of the specified Service Fabric partition. The - report must contain the information about the source of the health - report and property on which it is reported. The report is sent to a - Service Fabric gateway Partition, which forwards to the health store. - The report may be accepted by the gateway, but rejected by the health - store after extra validation. For example, the health store may reject - the report because of an invalid parameter, like a stale sequence number. - To see whether the report was applied in the health store, check that the - report appears in the events section. 
- :param str partition_id: The identity of the partition. - :param str source_id: The source name which identifies the - client/watchdog/system component which generated the health information. - :param str health_property: The property of the health information. An - entity can have health reports for different properties. The property is a - string and not a fixed enumeration to allow the reporter flexibility to - categorize the state condition that triggers the report. For example, a - reporter with SourceId "LocalWatchdog" can monitor the state of the - available disk on a node, so it can report "AvailableDisk" property on - that node. The same reporter can monitor the node connectivity, so it can - report a property "Connectivity" on the same node. In the health store, - these reports are treated as separate health events for the specified node. - Together with the SourceId, the property uniquely identifies the health - information. - :param str health_state: Possible values include: 'Invalid', 'Ok', - 'Warning', 'Error', 'Unknown' - :param str ttl: The duration, in milliseconds, for which this health report - is valid. When clients report periodically, they should send reports with - higher frequency than time to live. If not specified, time to live defaults - to infinite value. - :param str description: The description of the health information. It - represents free text used to add human readable information about the - report. The maximum string length for the description is 4096 characters. - If the provided string is longer, it will be automatically truncated. - When truncated, the last characters of the description contain a marker - "[Truncated]", and total string size is 4096 characters. The presence of - the marker indicates to users that truncation occurred. Note that when - truncated, the description has less than 4096 characters from the original - string. 
- :param str sequence_number: The sequence number for this health report as a - numeric string. The report sequence number is used by the health store to - detect stale reports. If not specified, a sequence number is auto-generated - by the health client when a report is added. - :param bool remove_when_expired: Value that indicates whether the report is - removed from health store when it expires. If set to true, the report is - removed from the health store after it expires. If set to false, the report - is treated as an error when expired. The value of this property is false by - default. When clients report periodically, they should set this value to - false (default). This way, is the reporter has issues (eg. deadlock) and - can't report, the entity is evaluated at error when the health report - expires. This flags the entity as being in Error health state. - """ - - from azure.servicefabric.models.health_information import HealthInformation - - info = HealthInformation(source_id, health_property, health_state, ttl, - description, sequence_number, remove_when_expired) - client.report_partition_health(partition_id, info, timeout) + immediate=False, timeout=60): + + health_info = create_health_information(source_id, health_property, + health_state, ttl, description, + sequence_number, + remove_when_expired) + client.report_partition_health(partition_id, health_info, timeout=timeout, + immediate=immediate) -def report_replica_health(client, partition_id, replica_id, source_id, +def report_replica_health(client, partition_id, replica_id, source_id, #pylint: disable=missing-docstring health_state, health_property, service_kind="Stateful", ttl=None, description=None, sequence_number=None, remove_when_expired=None, - timeout=60): - """ - Sends a health report on the Service Fabric replica. - Reports health state of the specified Service Fabric replica. 
The - report must contain the information about the source of the health - report and property on which it is reported. The report is sent to a - Service Fabric gateway Replica, which forwards to the health store. The - report may be accepted by the gateway, but rejected by the health store - after extra validation. For example, the health store may reject the - report because of an invalid parameter, like a stale sequence number. - To see whether the report was applied in the health store, check that - the report appears in the events section. - :param str partition_id: The identity of the partition. - :param str replica_id: The identifier of the replica. - :param str service_kind: The kind of service replica (Stateless or - Stateful) for which the health is being reported. Following are the - possible values: `Stateless`, `Stateful`. - :param str source_id: The source name which identifies the - client/watchdog/system component which generated the health information. - :param str health_property: The property of the health information. An - entity can have health reports for different properties. The property is a - string and not a fixed enumeration to allow the reporter flexibility to - categorize the state condition that triggers the report. For example, a - reporter with SourceId "LocalWatchdog" can monitor the state of the - available disk on a node, so it can report "AvailableDisk" property on - that node. The same reporter can monitor the node connectivity, so it can - report a property "Connectivity" on the same node. In the health store, - these reports are treated as separate health events for the specified node. - Together with the SourceId, the property uniquely identifies the health - information. - :param str health_state: Possible values include: 'Invalid', 'Ok', - 'Warning', 'Error', 'Unknown' - :param str ttl: The duration, in milliseconds, for which this health report - is valid. 
When clients report periodically, they should send reports with - higher frequency than time to live. If not specified, time to live defaults - to infinite value. - :param str description: The description of the health information. It - represents free text used to add human readable information about the - report. The maximum string length for the description is 4096 characters. - If the provided string is longer, it will be automatically truncated. - When truncated, the last characters of the description contain a marker - "[Truncated]", and total string size is 4096 characters. The presence of - the marker indicates to users that truncation occurred. Note that when - truncated, the description has less than 4096 characters from the original - string. - :param str sequence_number: The sequence number for this health report as a - numeric string. The report sequence number is used by the health store to - detect stale reports. If not specified, a sequence number is auto-generated - by the health client when a report is added. - :param bool remove_when_expired: Value that indicates whether the report is - removed from health store when it expires. If set to true, the report is - removed from the health store after it expires. If set to false, the report - is treated as an error when expired. The value of this property is false by - default. When clients report periodically, they should set this value to - false (default). This way, is the reporter has issues (eg. deadlock) and - can't report, the entity is evaluated at error when the health report - expires. This flags the entity as being in Error health state. 
- """ - - from azure.servicefabric.models.health_information import HealthInformation - - info = HealthInformation(source_id, health_property, health_state, ttl, - description, sequence_number, remove_when_expired) + immediate=False, timeout=60): + + info = create_health_information(source_id, health_property, health_state, + ttl, description, sequence_number, + remove_when_expired) client.report_replica_health(partition_id, replica_id, info, - service_kind, timeout) + service_kind, timeout=timeout, + immediate=immediate) -def report_node_health(client, node_name, source_id, health_property, +def report_node_health(client, node_name, source_id, health_property, #pylint: disable=missing-docstring health_state, ttl=None, description=None, sequence_number=None, remove_when_expired=None, - timeout=60): - """ - Sends a health report on the Service Fabric node. - Reports health state of the specified Service Fabric node. The report - must contain the information about the source of the health report - and property on which it is reported. The report is sent to a Service - Fabric gateway node, which forwards to the health store. The report may be - accepted by the gateway, but rejected by the health store after extra - validation. For example, the health store may reject the report because of - an invalid parameter, like a stale sequence number. To see whether the - report was applied in the health store, check that the report appears in - the events section. - :param str node_name: The name of the node. - :param str source_id: The source name which identifies the - client/watchdog/system component which generated the health information. - :param str health_property: The property of the health information. An - entity can have health reports for different properties. The property is a - string and not a fixed enumeration to allow the reporter flexibility to - categorize the state condition that triggers the report. 
For example, a - reporter with SourceId "LocalWatchdog" can monitor the state of the - available disk on a node, so it can report "AvailableDisk" property on - that node. The same reporter can monitor the node connectivity, so it can - report a property "Connectivity" on the same node. In the health store, - these reports are treated as separate health events for the specified node. - Together with the SourceId, the property uniquely identifies the health - information. - :param str health_state: Possible values include: 'Invalid', 'Ok', - 'Warning', 'Error', 'Unknown' - :param str ttl: The duration, in milliseconds, for which this health report - is valid. When clients report periodically, they should send reports with - higher frequency than time to live. If not specified, time to live defaults - to infinite value. - :param str description: The description of the health information. It - represents free text used to add human readable information about the - report. The maximum string length for the description is 4096 characters. - If the provided string is longer, it will be automatically truncated. - When truncated, the last characters of the description contain a marker - "[Truncated]", and total string size is 4096 characters. The presence of - the marker indicates to users that truncation occurred. Note that when - truncated, the description has less than 4096 characters from the original - string. - :param str sequence_number: The sequence number for this health report as a - numeric string. The report sequence number is used by the health store to - detect stale reports. If not specified, a sequence number is auto-generated - by the health client when a report is added. - :param bool remove_when_expired: Value that indicates whether the report is - removed from health store when it expires. If set to true, the report is - removed from the health store after it expires. If set to false, the report - is treated as an error when expired. 
The value of this property is false by - default. When clients report periodically, they should set this value to - false (default). This way, is the reporter has issues (eg. deadlock) and - can't report, the entity is evaluated at error when the health report - expires. This flags the entity as being in Error health state. - """ - - from azure.servicefabric.models.health_information import HealthInformation - - info = HealthInformation(source_id, health_property, health_state, ttl, - description, sequence_number, remove_when_expired) + immediate=False, timeout=60): - client.report_node_health(node_name, info, timeout) + info = create_health_information(source_id, health_property, health_state, + ttl, description, sequence_number, + remove_when_expired) + client.report_node_health(node_name, info, immediate, timeout) diff --git a/src/sfctl/custom_service.py b/src/sfctl/custom_service.py index 2cf0aae3..7f3139e3 100644 --- a/src/sfctl/custom_service.py +++ b/src/sfctl/custom_service.py @@ -224,9 +224,13 @@ def create( # pylint: disable=too-many-arguments, too-many-locals stand_by_replica_keep=None, no_persisted_state=False, instance_count=None, timeout=60): """ - Creates the specified Service Fabric service from the description. - :param str app_id: The identity of the parent application. This is - typically the full id of the application without the 'fabric:' URI scheme. + Creates the specified Service Fabric service. + :param str app_id: The identity of the application. This is + typically the full name of the application without the 'fabric:' URI + scheme. Starting from version 6.0, hierarchical names are delimited with + the '~' character. For example, if the application name is + 'fabric://myapp/app1', the application identity would be 'myapp~app1' in + 6.0+ and 'myapp/app1' in previous versions. :param str name: Name of the service. This should be a child of the application id. This is the full name including the `fabric:` URI. 
For example service `fabric:/A/B` is a child of application @@ -379,8 +383,12 @@ def update(client, service_id, stateless=False, stateful=False, #pylint: disable quorum_loss_wait=None, stand_by_replica_keep=None, timeout=60): """ Updates the specified service using the given update description. - :param str service_id: Target service to update. This is typically the full - id of the service without the 'fabric:' URI scheme. + :param str service_id: The identity of the service. This is typically the + full name of the service without the 'fabric:' URI scheme. Starting from + version 6.0, hierarchical names are delimited with the "~" character. For + example, if the service name is 'fabric://myapp/app1/svc1', the service + identity would be 'myapp~app1~svc1' in 6.0+ and 'myapp/app1/svc1' in + previous versions. :param bool stateless: Indicates the target service is a stateless service. :param bool stateful: Indicates the target service is a stateful service. :param str constraints: The placement constraints as a string. Placement diff --git a/src/sfctl/helps/app.py b/src/sfctl/helps/app.py index 284916be..858f322e 100644 --- a/src/sfctl/helps/app.py +++ b/src/sfctl/helps/app.py @@ -4,7 +4,7 @@ # license information. # ----------------------------------------------------------------------------- -"""Help documentation for Service Fabric compose commands.""" +"""Help documentation for Service Fabric application and compose commands.""" from knack.help_files import helps @@ -19,10 +19,9 @@ helps['application upload'] = """ type: command - short-summary: Copy a Service Fabric application package to the image - store - long-summary: Optionally display upload progress for each file in the - package. Upload progress is sent to `stderr` + short-summary: Copy a Service Fabric application package to the image store + long-summary: Optionally display upload progress for each file in the + package. 
Upload progress is sent to `stderr` parameters: - name: --path type: string @@ -35,3 +34,88 @@ short-summary: Destination image store to upload the application package to """ + +helps['application upgrade'] = """ + type: command + short-summary: Starts upgrading an application in the Service Fabric + cluster + long-summary: Validates the supplied application upgrade parameters and + starts upgrading the application if the parameters are valid. Note + that upgrade description replaces the existing application description. + This means that if the parameters are not specified, the existing + parameters on the applications will be overwritten with the empty + parameters list. This results in the application using the default + value of the parameters from the application manifest. + parameters: + - name: --app-id + type: string + short-summary: The identity of the application. + long-summary: "This is typically the full name of the application + without the 'fabric:' URI scheme. Starting from version 6.0, + hierarchical names are delimited with the '~' character. For + example, if the application name is 'fabric://myapp/app1', the + application identity would be 'myapp~app1' in 6.0+ and 'myapp/app1' + in previous versions." + - name: --app-version + type: string + short-summary: Target application version + - name: --parameters + type: string + short-summary: A JSON encoded list of application parameter overrides + to be applied when upgrading the application + - name: --mode + type: string + short-summary: The mode used to monitor health during a rolling + upgrade + - name: --replica-set-check-timeout + type: int + short-summary: The maximum amount of time to block processing of an + upgrade domain and prevent loss of availability when there are + unexpected issues. Measured in seconds.
+ - name: --force-restart + type: bool + short-summary: Forcefully restart processes during upgrade even + when the code version has not changed + - name: --failure-action + type: string + short-summary: The action to perform when a Monitored upgrade + encounters monitoring policy or health policy violations + - name: --health-check-wait-duration + type: string + short-summary: The amount of time to wait after completing an upgrade + domain before applying health policies. Measured in milliseconds. + - name: --health-check-stable-duration + type: string + short-summary: The amount of time that the application or cluster + must remain healthy before the upgrade proceeds to the next + upgrade domain. Measured in milliseconds. + - name: --health-check-retry-timeout + type: string + short-summary: The amount of time to retry health evaluations when + the application or cluster is unhealthy before the failure action + is executed. Measured in milliseconds. + - name: --upgrade-timeout + type: string + short-summary: The amount of time the overall upgrade has to complete + before FailureAction is executed. Measured in milliseconds. + - name: --upgrade-domain-timeout + type: string + short-summary: The amount of time each upgrade domain has to complete + before FailureAction is executed. Measured in milliseconds. + - name: --warning-as-error + type: bool + short-summary: Treat health evaluation warnings with the same + severity as errors + - name: --max-unhealthy-apps + type: int + short-summary: The maximum allowed percentage of unhealthy deployed + applications. Represented as a number between 0 and 100. + - name: --default-service-health-policy + type: string + short-summary: JSON encoded specification of the health policy used + by default to evaluate the health of a service type + - name: --service-health-policy + type: string + short-summary: JSON encoded map with service type health + policy per service type name. The map is empty by default.
+""" diff --git a/src/sfctl/helps/cluster_upgrade.py b/src/sfctl/helps/cluster_upgrade.py new file mode 100644 index 00000000..8e6bdd6b --- /dev/null +++ b/src/sfctl/helps/cluster_upgrade.py @@ -0,0 +1,272 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# ----------------------------------------------------------------------------- + +"""Help documentation for Service Fabric cluster upgrade commands""" + +from knack.help_files import helps + +helps['cluster upgrade'] = """ + type: command + short-summary: Start upgrading the code or configuration version of a + Service Fabric cluster + long-summary: Validate the supplied upgrade parameters and start upgrading + the code or configuration version of a Service Fabric cluster if the + parameters are valid. + parameters: + - name: --code-version + type: string + short-summary: The cluster code version + - name: --config-version + type: string + short-summary: The cluster configuration version + - name: --rolling-upgrade-mode + type: string + short-summary: "Possible values include: 'Invalid', + 'UnmonitoredAuto', 'UnmonitoredManual', 'Monitored'" + - name: --replica-set-check-timeout + type: string + short-summary: Upgrade replica set check timeout measured in + seconds + - name: --force-restart + type: bool + short-summary: Force restart + - name: --failure-action + type: string + short-summary: "Possible values include: 'Invalid', 'Rollback', + 'Manual'" + - name: --health-check-wait + type: string + short-summary: Health check wait duration measured in milliseconds + - name: --health-check-stable + type: string + short-summary: Health check stable duration measured in milliseconds + - name: --health-check-retry + type: string + short-summary: Health check retry timeout measured in milliseconds + - name: --upgrade-timeout + 
type: string + short-summary: Upgrade timeout measured in milliseconds + - name: --upgrade-domain-timeout + type: string + short-summary: Upgrade domain timeout measured in milliseconds + - name: --warning-as-error + type: bool + short-summary: Warnings are treated with the same severity as errors + - name: --unhealthy-nodes + type: int + short-summary: The maximum allowed percentage of unhealthy nodes + before reporting an error + long-summary: For example, to allow 10% of nodes to be unhealthy, + this value would be 10. The percentage represents the maximum + tolerated percentage of nodes that can be unhealthy before the + cluster is considered in error. If the percentage is respected but + there is at least one unhealthy node, the health is evaluated as + Warning. The percentage is calculated by dividing the number of + unhealthy nodes over the total number of nodes in the cluster. The + computation rounds up to tolerate one failure on small numbers of + nodes. In large clusters, some nodes will always be down or out for + repairs, so this percentage should be configured to tolerate that. + - name: --unhealthy-applications + type: int + short-summary: The maximum allowed percentage of unhealthy + applications before reporting an error + long-summary: For example, to allow 10% of applications to be + unhealthy, this value would be 10. The percentage represents the + maximum tolerated percentage of applications that can be unhealthy + before the cluster is considered in error. If the percentage is + respected but there is at least one unhealthy application, the + health is evaluated as Warning. This is calculated by dividing the + number of unhealthy applications over the total number of + application instances in the cluster, excluding applications of + application types that are included in the + ApplicationTypeHealthPolicyMap. The computation rounds up to + tolerate one failure on small numbers of applications. 
+ - name: --app-type-health-map + type: string + short-summary: JSON encoded dictionary of pairs of application type + name and maximum percentage unhealthy before raising error + - name: --delta-health-evaluation + type: bool + short-summary: Enables delta health evaluation rather than absolute + health evaluation after completion of each upgrade domain + - name: --delta-unhealthy-nodes + type: int + short-summary: The maximum allowed percentage of nodes health + degradation allowed during cluster upgrades + long-summary: The delta is measured between the state of the nodes at + the beginning of upgrade and the state of the nodes at the time of + the health evaluation. The check is performed after every upgrade + domain upgrade completion to make sure the global state of the + cluster is within tolerated limits. + - name: --upgrade-domain-delta-unhealthy-nodes + type: int + short-summary: The maximum allowed percentage of upgrade domain nodes + health degradation allowed during cluster upgrades + long-summary: The delta is measured between the state of the + upgrade domain nodes at the beginning of upgrade and the state of + the upgrade domain nodes at the time of the health evaluation. The + check is performed after every upgrade domain upgrade completion + for all completed upgrade domains to make sure the state of the + upgrade domains is within tolerated limits. + - name: --app-health-map + type: string + short-summary: JSON encoded dictionary of pairs of application name + and maximum percentage unhealthy before raising error +""" + +helps['sa-cluster config-upgrade'] = """ + type: command + short-summary: Start upgrading the configuration of a Service Fabric + standalone cluster + long-summary: Validate the supplied configuration upgrade parameters and + start upgrading the cluster configuration if the parameters are valid. 
+ parameters: + - name: --cluster-config + type: string + short-summary: The cluster configuration + - name: --health-check-retry + type: string + short-summary: The length of time between attempts to perform + health checks if the application or cluster is not healthy + - name: --health-check-wait + type: string + short-summary: The length of time to wait after completing an + upgrade domain before starting the health checks process + - name: --health-check-stable + type: string + short-summary: The length of time that the application or cluster + must remain healthy + - name: --upgrade-domain-timeout + type: string + short-summary: The timeout for the upgrade domain + - name: --upgrade-timeout + type: string + short-summary: The upgrade timeout + - name: --unhealthy-applications + type: int + short-summary: The maximum allowed percentage of unhealthy + applications during the upgrade. Allowed values are integer values + from zero to 100. + - name: --unhealthy-nodes + type: int + short-summary: The maximum allowed percentage of unhealthy nodes + during the upgrade. Allowed values are integer values from zero + to 100. + - name: --delta-unhealthy-nodes + type: int + short-summary: The maximum allowed percentage of delta health + degradation during the upgrade. Allowed values are integer values + from zero to 100. + - name: --upgrade-domain-delta-unhealthy-nodes + type: int + short-summary: The maximum allowed percentage of upgrade domain delta + health degradation during the upgrade. Allowed values are integer + values from zero to 100.
+""" + +helps['cluster upgrade-update'] = """ + type: command + short-summary: Update the upgrade parameters of a Service Fabric cluster + upgrade + parameters: + - name: --upgrade-kind + type: string + short-summary: "Possible values include: 'Invalid', 'Rolling', + 'Rolling_ForceRestart'" + - name: --rolling-upgrade-mode + type: string + short-summary: "Possible values include: 'Invalid', + 'UnmonitoredAuto', 'UnmonitoredManual', 'Monitored'" + - name: --replica-set-check-timeout + type: string + short-summary: Upgrade replica set check timeout measured in + seconds + - name: --force-restart + type: bool + short-summary: Force restart + - name: --failure-action + type: string + short-summary: "Possible values include: 'Invalid', 'Rollback', + 'Manual'" + - name: --health-check-wait + type: string + short-summary: Health check wait duration measured in milliseconds + - name: --health-check-stable + type: string + short-summary: Health check stable duration measured in milliseconds + - name: --health-check-retry + type: string + short-summary: Health check retry timeout measured in milliseconds + - name: --upgrade-timeout + type: string + short-summary: Upgrade timeout measured in milliseconds + - name: --upgrade-domain-timeout + type: string + short-summary: Upgrade domain timeout measured in milliseconds + - name: --warning-as-error + type: bool + short-summary: Warnings are treated with the same severity as errors + - name: --unhealthy-nodes + type: int + short-summary: The maximum allowed percentage of unhealthy nodes + before reporting an error + long-summary: For example, to allow 10% of nodes to be unhealthy, + this value would be 10. The percentage represents the maximum + tolerated percentage of nodes that can be unhealthy before the + cluster is considered in error. If the percentage is respected but + there is at least one unhealthy node, the health is evaluated as + Warning. 
The percentage is calculated by dividing the number of + unhealthy nodes over the total number of nodes in the cluster. The + computation rounds up to tolerate one failure on small numbers of + nodes. In large clusters, some nodes will always be down or out for + repairs, so this percentage should be configured to tolerate that. + - name: --unhealthy-applications + type: int + short-summary: The maximum allowed percentage of unhealthy + applications before reporting an error + long-summary: For example, to allow 10% of applications to be + unhealthy, this value would be 10. The percentage represents the + maximum tolerated percentage of applications that can be unhealthy + before the cluster is considered in error. If the percentage is + respected but there is at least one unhealthy application, the + health is evaluated as Warning. This is calculated by dividing the + number of unhealthy applications over the total number of + application instances in the cluster, excluding applications of + application types that are included in the + ApplicationTypeHealthPolicyMap. The computation rounds up to + tolerate one failure on small numbers of applications. + - name: --app-type-health-map + type: string + short-summary: JSON encoded dictionary of pairs of application type + name and maximum percentage unhealthy before raising error + - name: --delta-health-evaluation + type: bool + short-summary: Enables delta health evaluation rather than absolute + health evaluation after completion of each upgrade domain + - name: --delta-unhealthy-nodes + type: int + short-summary: The maximum allowed percentage of nodes health + degradation allowed during cluster upgrades + long-summary: The delta is measured between the state of the nodes at + the beginning of upgrade and the state of the nodes at the time of + the health evaluation. The check is performed after every upgrade + domain upgrade completion to make sure the global state of the + cluster is within tolerated limits. 
+ - name: --upgrade-domain-delta-unhealthy-nodes + type: int + short-summary: The maximum allowed percentage of upgrade domain nodes + health degradation allowed during cluster upgrades + long-summary: The delta is measured between the state of the + upgrade domain nodes at the beginning of upgrade and the state of + the upgrade domain nodes at the time of the health evaluation. The + check is performed after every upgrade domain upgrade completion + for all completed upgrade domains to make sure the state of the + upgrade domains is within tolerated limits. + - name: --app-health-map + type: string + short-summary: JSON encoded dictionary of pairs of application name + and maximum percentage unhealthy before raising error +""" \ No newline at end of file diff --git a/src/sfctl/helps/compose.py b/src/sfctl/helps/compose.py new file mode 100644 index 00000000..d6a2faab --- /dev/null +++ b/src/sfctl/helps/compose.py @@ -0,0 +1,110 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# ----------------------------------------------------------------------------- + +"""Help documentation for Service Fabric Docker compose related commands.""" + +from knack.help_files import helps + +helps['compose create'] = """ + type: command + short-summary: Creates a Service Fabric compose deployment + parameters: + - name: --name + type: string + short-summary: The identity of the deployment + - name: --file-path + type: string + short-summary: Path to the target Docker compose file + - name: --user + type: string + short-summary: User name to connect to container registry + - name: --has-pass + type: bool + short-summary: Will prompt for a password to the container registry + - name: --encrypted-pass + type: string + short-summary: Rather than prompting for a container registry + password, use an already encrypted passphrase +""" + +helps['compose upgrade'] = """ + type: command + short-summary: Starts upgrading a compose deployment in the Service Fabric + cluster + long-summary: Validates the supplied upgrade parameters and starts + upgrading the deployment if the parameters are valid + parameters: + - name: --name + type: string + short-summary: The identity of the deployment + - name: --file-path + type: string + short-summary: Path to the target Docker compose file + - name: --user + type: string + short-summary: User name to connect to container registry + - name: --has-pass + type: bool + short-summary: Will prompt for a password to the container registry + - name: --encrypted-pass + type: string + short-summary: Rather than prompting for a container registry + password, use an already encrypted passphrase + - name: --upgrade-mode + type: string + short-summary: "Possible values include: 'Invalid', + 'UnmonitoredAuto', 'UnmonitoredManual', 'Monitored'" + - name: --replica-set-check + type: string + short-summary: Upgrade replica set check timeout measured in + seconds + - name: --force-restart + type: bool + short-summary: Force restart + - name: 
--failure-action + type: string + short-summary: "Possible values include: 'Invalid', 'Rollback', + 'Manual'" + - name: --health-check-wait + type: string + short-summary: Health check wait duration measured in milliseconds + - name: --health-check-stable + type: string + short-summary: Health check stable duration measured in milliseconds + - name: --health-check-retry + type: string + short-summary: Health check retry timeout measured in milliseconds + - name: --upgrade-timeout + type: string + short-summary: Upgrade timeout measured in milliseconds + - name: --upgrade-domain-timeout + type: string + short-summary: Upgrade domain timeout measured in milliseconds + - name: --warning-as-error + type: bool + short-summary: Warnings are treated with the same severity as errors + - name: --unhealthy-app + type: int + short-summary: The maximum allowed percentage of unhealthy + applications before reporting an error + long-summary: For example, to allow 10% of applications to be + unhealthy, this value would be 10. The percentage represents the + maximum tolerated percentage of applications that can be unhealthy + before the cluster is considered in error. If the percentage is + respected but there is at least one unhealthy application, the + health is evaluated as Warning. This is calculated by dividing the + number of unhealthy applications over the total number of + application instances in the cluster. 
+ - name: --default-svc-type-health-map + type: string + short-summary: JSON encoded dictionary that describes the + health policy used to evaluate the health of services + - name: --svc-type-health-map + type: string + short-summary: JSON encoded list of objects that describe the + health policies used to evaluate the health of different service + types +""" \ No newline at end of file diff --git a/src/sfctl/helps/health.py b/src/sfctl/helps/health.py new file mode 100644 index 00000000..b13607fa --- /dev/null +++ b/src/sfctl/helps/health.py @@ -0,0 +1,645 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# ----------------------------------------------------------------------------- + +"""Help documentation for Service Fabric health commands.""" + +from knack.help_files import helps + +helps['cluster report-health'] = """ + type: command + short-summary: Sends a health report on the Service Fabric cluster. + long-summary: The report must contain the information about the source of + the health report and property on which it is reported. The report is + sent to a Service Fabric gateway node, which forwards to the health + store. The report may be accepted by the gateway, but rejected by the + health store after extra validation. For example, the health store may + reject the report because of an invalid parameter, like a stale + sequence number. To see whether the report was applied in the health + store, check that the report appears in the HealthEvents of the + cluster. + parameters: + - name: --source-id + type: string + short-summary: The source name which identifies the + client/watchdog/system component which generated the health + information. + - name: --health-property + type: string + short-summary: The property of the health information. 
+ long-summary: An entity can have health reports for different + properties. The property is a string and not a fixed enumeration to + allow the reporter flexibility to categorize the state condition that + triggers the report. For example, a reporter with SourceId + "LocalWatchdog" can monitor the state of the available disk on a + node, so it can report "AvailableDisk" property on that node. The + same reporter can monitor the node connectivity, so it can report a + property "Connectivity" on the same node. In the health store, + these reports are treated as separate health events for the + specified node. Together with the SourceId, the property uniquely + identifies the health information. + - name: --health-state + type: string + short-summary: "Possible values include: 'Invalid', 'Ok', 'Warning', + 'Error', 'Unknown'" + - name: --ttl + type: string + short-summary: The duration for which this health report is valid. + This field is using ISO8601 format for specifying the duration. + long-summary: When clients report periodically, they should send + reports with higher frequency than time to live. If clients report + on transition, they can set the time to live to infinite. When + time to live expires, the health event that contains the health + information is either removed from health store, if + RemoveWhenExpired is true, or evaluated at error, if + RemoveWhenExpired is false. If not specified, time to live defaults + to infinite value. + - name: --description + type: string + short-summary: The description of the health information. + long-summary: It represents free text used to add human readable + information about the report. The maximum string length for the + description is 4096 characters. If the provided string is longer, + it will be automatically truncated. When truncated, the last + characters of the description contain a marker "[Truncated]", and + total string size is 4096 characters. 
The presence of the marker + indicates to users that truncation occurred. Note that when + truncated, the description has less than 4096 characters from the + original string. + - name: --sequence-number + type: string + short-summary: The sequence number for this health report as a + numeric string. + long-summary: The report sequence number is used by the health store + to detect stale reports. If not specified, a sequence number is + auto-generated by the health client when a report is added. + - name: --remove-when-expired + type: bool + short-summary: Value that indicates whether the report is removed + from health store when it expires. + long-summary: If set to true, the report is removed from the health + store after it expires. If set to false, the report is treated as + an error when expired. The value of this property is false by + default. When clients report periodically, they should set + RemoveWhenExpired false (default). This way, if the reporter has + issues (e.g. deadlock) and can't report, the entity is evaluated at + error when the health report expires. This flags the entity as + being in Error health state. + - name: --immediate + type: bool + short-summary: A flag which indicates whether the report should be + sent immediately. + long-summary: A health report is sent to a Service Fabric gateway + Application, which forwards to the health store. If Immediate is + set to true, the report is sent immediately from Http Gateway to + the health store, regardless of the fabric client settings that + the Http Gateway Application is using. This is useful for critical + reports that should be sent as soon as possible. Depending on + timing and other conditions, sending the report may still fail, + for example if the Http Gateway is closed or the message doesn't + reach the Gateway. If Immediate is set to false, the report is sent + based on the health client settings from the Http Gateway. 
+ Therefore, it will be batched according to the + HealthReportSendInterval configuration. This is the recommended + setting because it allows the health client to optimize health + reporting messages to health store as well as health report + processing. By default, reports are not sent immediately. +""" + +helps['node report-health'] = """ + type: command + short-summary: Sends a health report on the Service Fabric node. + long-summary: Reports health state of the specified Service Fabric node. + The report must contain the information about the source of the health + report and property on which it is reported. The report is sent to a + Service Fabric gateway node, which forwards to the health store. The + report may be accepted by the gateway, but rejected by the health store + after extra validation. For example, the health store may reject the + report because of an invalid parameter, like a stale sequence number. + To see whether the report was applied in the health store, check that + the report appears in the HealthEvents section. + parameters: + - name: --node-name + type: string + short-summary: Node name to report on + - name: --source-id + type: string + short-summary: The source name which identifies the + client/watchdog/system component which generated the health + information. + - name: --health-property + type: string + short-summary: The property of the health information. + long-summary: An entity can have health reports for different + properties. The property is a string and not a fixed enumeration to + allow the reporter flexibility to categorize the state condition that + triggers the report. For example, a reporter with SourceId + "LocalWatchdog" can monitor the state of the available disk on a + node, so it can report "AvailableDisk" property on that node. The + same reporter can monitor the node connectivity, so it can report a + property "Connectivity" on the same node. 
In the health store, + these reports are treated as separate health events for the + specified node. Together with the SourceId, the property uniquely + identifies the health information. + - name: --health-state + type: string + short-summary: "Possible values include: 'Invalid', 'Ok', 'Warning', + 'Error', 'Unknown'" + - name: --ttl + type: string + short-summary: The duration for which this health report is valid. + This field is using ISO8601 format for specifying the duration. + long-summary: When clients report periodically, they should send + reports with higher frequency than time to live. If clients report + on transition, they can set the time to live to infinite. When + time to live expires, the health event that contains the health + information is either removed from health store, if + RemoveWhenExpired is true, or evaluated at error, if + RemoveWhenExpired is false. If not specified, time to live defaults + to infinite value. + - name: --description + type: string + short-summary: The description of the health information. + long-summary: It represents free text used to add human readable + information about the report. The maximum string length for the + description is 4096 characters. If the provided string is longer, + it will be automatically truncated. When truncated, the last + characters of the description contain a marker "[Truncated]", and + total string size is 4096 characters. The presence of the marker + indicates to users that truncation occurred. Note that when + truncated, the description has less than 4096 characters from the + original string. + - name: --sequence-number + type: string + short-summary: The sequence number for this health report as a + numeric string. + long-summary: The report sequence number is used by the health store + to detect stale reports. If not specified, a sequence number is + auto-generated by the health client when a report is added. 
+ - name: --remove-when-expired + type: bool + short-summary: Value that indicates whether the report is removed + from health store when it expires. + long-summary: If set to true, the report is removed from the health + store after it expires. If set to false, the report is treated as + an error when expired. The value of this property is false by + default. When clients report periodically, they should set + RemoveWhenExpired false (default). This way, if the reporter has + issues (e.g. deadlock) and can't report, the entity is evaluated at + error when the health report expires. This flags the entity as + being in Error health state. + - name: --immediate + type: bool + short-summary: A flag which indicates whether the report should be + sent immediately. + long-summary: A health report is sent to a Service Fabric gateway + Application, which forwards to the health store. If Immediate is + set to true, the report is sent immediately from Http Gateway to + the health store, regardless of the fabric client settings that + the Http Gateway Application is using. This is useful for critical + reports that should be sent as soon as possible. Depending on + timing and other conditions, sending the report may still fail, + for example if the Http Gateway is closed or the message doesn't + reach the Gateway. If Immediate is set to false, the report is sent + based on the health client settings from the Http Gateway. + Therefore, it will be batched according to the + HealthReportSendInterval configuration. This is the recommended + setting because it allows the health client to optimize health + reporting messages to health store as well as health report + processing. By default, reports are not sent immediately. +""" + +helps['application report-health'] = """ + type: command + short-summary: Sends a health report on the Service Fabric application + long-summary: Reports health state of the specified Service Fabric + application. 
The report must contain the information about the source + of the health report and property on which it is reported. + The report is sent to a Service Fabric gateway Application, which + forwards to the health store. The report may be accepted by the + gateway, but rejected by the health store after extra validation. + For example, the health store may reject the report because of an + invalid parameter, like a stale sequence number. To see whether the + report was applied in the health store, get application health and + check that the report appears. + parameters: + - name: --application-id + type: string + short-summary: The identity of the application + long-summary: "This is typically the full name of the application + without the 'fabric:' URI scheme. Starting from version 6.0, + hierarchical names are delimited with the '~' character. For + example, if the application name is 'fabric://myapp/app1', the + application identity would be 'myapp~app1' in 6.0+ and 'myapp/app1' + in previous versions." + - name: --source-id + type: string + short-summary: The source name which identifies the + client/watchdog/system component which generated the health + information. + - name: --health-property + type: string + short-summary: The property of the health information. + long-summary: An entity can have health reports for different + properties. The property is a string and not a fixed enumeration to + allow the reporter flexibility to categorize the state condition that + triggers the report. For example, a reporter with SourceId + "LocalWatchdog" can monitor the state of the available disk on a + node, so it can report "AvailableDisk" property on that node. The + same reporter can monitor the node connectivity, so it can report a + property "Connectivity" on the same node. In the health store, + these reports are treated as separate health events for the + specified node. Together with the SourceId, the property uniquely + identifies the health information. 
+ - name: --health-state + type: string + short-summary: "Possible values include: 'Invalid', 'Ok', 'Warning', + 'Error', 'Unknown'" + - name: --ttl + type: string + short-summary: The duration for which this health report is valid. + This field is using ISO8601 format for specifying the duration. + long-summary: When clients report periodically, they should send + reports with higher frequency than time to live. If clients report + on transition, they can set the time to live to infinite. When + time to live expires, the health event that contains the health + information is either removed from health store, if + RemoveWhenExpired is true, or evaluated at error, if + RemoveWhenExpired is false. If not specified, time to live defaults + to infinite value. + - name: --description + type: string + short-summary: The description of the health information. + long-summary: It represents free text used to add human readable + information about the report. The maximum string length for the + description is 4096 characters. If the provided string is longer, + it will be automatically truncated. When truncated, the last + characters of the description contain a marker "[Truncated]", and + total string size is 4096 characters. The presence of the marker + indicates to users that truncation occurred. Note that when + truncated, the description has less than 4096 characters from the + original string. + - name: --sequence-number + type: string + short-summary: The sequence number for this health report as a + numeric string. + long-summary: The report sequence number is used by the health store + to detect stale reports. If not specified, a sequence number is + auto-generated by the health client when a report is added. + - name: --remove-when-expired + type: bool + short-summary: Value that indicates whether the report is removed + from health store when it expires. + long-summary: If set to true, the report is removed from the health + store after it expires. 
If set to false, the report is treated as + an error when expired. The value of this property is false by + default. When clients report periodically, they should set + RemoveWhenExpired false (default). This way, if the reporter has + issues (e.g. deadlock) and can't report, the entity is evaluated at + error when the health report expires. This flags the entity as + being in Error health state. + - name: --immediate + type: bool + short-summary: A flag which indicates whether the report should be + sent immediately. + long-summary: A health report is sent to a Service Fabric gateway + Application, which forwards to the health store. If Immediate is + set to true, the report is sent immediately from Http Gateway to + the health store, regardless of the fabric client settings that + the Http Gateway Application is using. This is useful for critical + reports that should be sent as soon as possible. Depending on + timing and other conditions, sending the report may still fail, + for example if the Http Gateway is closed or the message doesn't + reach the Gateway. If Immediate is set to false, the report is sent + based on the health client settings from the Http Gateway. + Therefore, it will be batched according to the + HealthReportSendInterval configuration. This is the recommended + setting because it allows the health client to optimize health + reporting messages to health store as well as health report + processing. By default, reports are not sent immediately. +""" + +helps['service report-health'] = """ + type: command + short-summary: Sends a health report on the Service Fabric service + long-summary: Reports health state of the specified Service Fabric service. + The report must contain the information about the source of the health + report and property on which it is reported. The report is sent to a + Service Fabric gateway Service, which forwards to the health store. 
+ The report may be accepted by the gateway, but rejected by the health + store after extra validation. For example, the health store may reject + the report because of an invalid parameter, like a stale sequence number. + To see whether the report was applied in the health store, check that the + report appears in the health events of the service. + parameters: + - name: --service-id + type: string + short-summary: The identity of the service. + long-summary: "This is typically the full name of the service without + the 'fabric:' URI scheme. Starting from version 6.0, hierarchical + names are delimited with the '~' character. For example, if the + service name is 'fabric://myapp/app1/svc1', the service identity + would be 'myapp~app1~svc1' in 6.0+ and 'myapp/app1/svc1' in + previous versions." + - name: --source-id + type: string + short-summary: The source name which identifies the + client/watchdog/system component which generated the health + information. + - name: --health-property + type: string + short-summary: The property of the health information. + long-summary: An entity can have health reports for different + properties. The property is a string and not a fixed enumeration to + allow the reporter flexibility to categorize the state condition that + triggers the report. For example, a reporter with SourceId + "LocalWatchdog" can monitor the state of the available disk on a + node, so it can report "AvailableDisk" property on that node. The + same reporter can monitor the node connectivity, so it can report a + property "Connectivity" on the same node. In the health store, + these reports are treated as separate health events for the + specified node. Together with the SourceId, the property uniquely + identifies the health information. 
+ - name: --health-state + type: string + short-summary: "Possible values include: 'Invalid', 'Ok', 'Warning', + 'Error', 'Unknown'" + - name: --ttl + type: string + short-summary: The duration for which this health report is valid. + This field is using ISO8601 format for specifying the duration. + long-summary: When clients report periodically, they should send + reports with higher frequency than time to live. If clients report + on transition, they can set the time to live to infinite. When + time to live expires, the health event that contains the health + information is either removed from health store, if + RemoveWhenExpired is true, or evaluated at error, if + RemoveWhenExpired is false. If not specified, time to live defaults + to infinite value. + - name: --description + type: string + short-summary: The description of the health information. + long-summary: It represents free text used to add human readable + information about the report. The maximum string length for the + description is 4096 characters. If the provided string is longer, + it will be automatically truncated. When truncated, the last + characters of the description contain a marker "[Truncated]", and + total string size is 4096 characters. The presence of the marker + indicates to users that truncation occurred. Note that when + truncated, the description has less than 4096 characters from the + original string. + - name: --sequence-number + type: string + short-summary: The sequence number for this health report as a + numeric string. + long-summary: The report sequence number is used by the health store + to detect stale reports. If not specified, a sequence number is + auto-generated by the health client when a report is added. + - name: --remove-when-expired + type: bool + short-summary: Value that indicates whether the report is removed + from health store when it expires. + long-summary: If set to true, the report is removed from the health + store after it expires. 
If set to false, the report is treated as + an error when expired. The value of this property is false by + default. When clients report periodically, they should set + RemoveWhenExpired false (default). This way, if the reporter has + issues (e.g. deadlock) and can't report, the entity is evaluated at + error when the health report expires. This flags the entity as + being in Error health state. + - name: --immediate + type: bool + short-summary: A flag which indicates whether the report should be + sent immediately. + long-summary: A health report is sent to a Service Fabric gateway + Application, which forwards to the health store. If Immediate is + set to true, the report is sent immediately from Http Gateway to + the health store, regardless of the fabric client settings that + the Http Gateway Application is using. This is useful for critical + reports that should be sent as soon as possible. Depending on + timing and other conditions, sending the report may still fail, + for example if the Http Gateway is closed or the message doesn't + reach the Gateway. If Immediate is set to false, the report is sent + based on the health client settings from the Http Gateway. + Therefore, it will be batched according to the + HealthReportSendInterval configuration. This is the recommended + setting because it allows the health client to optimize health + reporting messages to health store as well as health report + processing. By default, reports are not sent immediately. +""" + +helps['partition report-health'] = """ + type: command + short-summary: Sends a health report on the Service Fabric partition. + long-summary: Reports health state of the specified Service Fabric + partition. The report must contain the information about the source of + the health report and property on which it is reported. The report is + sent to a Service Fabric gateway Partition, which forwards to the + health store. 
The report may be accepted by the gateway, but rejected + by the health store after extra validation. For example, the health + store may reject the report because of an invalid parameter, like a + stale sequence number. To see whether the report was applied in the + health store, check that the report appears in the events section. + parameters: + - name: --partition-id + type: string + short-summary: The identity of the partition + - name: --source-id + type: string + short-summary: The source name which identifies the + client/watchdog/system component which generated the health + information. + - name: --health-property + type: string + short-summary: The property of the health information. + long-summary: An entity can have health reports for different + properties. The property is a string and not a fixed enumeration to + allow the reporter flexibility to categorize the state condition that + triggers the report. For example, a reporter with SourceId + "LocalWatchdog" can monitor the state of the available disk on a + node, so it can report "AvailableDisk" property on that node. The + same reporter can monitor the node connectivity, so it can report a + property "Connectivity" on the same node. In the health store, + these reports are treated as separate health events for the + specified node. Together with the SourceId, the property uniquely + identifies the health information. + - name: --health-state + type: string + short-summary: "Possible values include: 'Invalid', 'Ok', 'Warning', + 'Error', 'Unknown'" + - name: --ttl + type: string + short-summary: The duration for which this health report is valid. + This field is using ISO8601 format for specifying the duration. + long-summary: When clients report periodically, they should send + reports with higher frequency than time to live. If clients report + on transition, they can set the time to live to infinite. 
When + time to live expires, the health event that contains the health + information is either removed from health store, if + RemoveWhenExpired is true, or evaluated at error, if + RemoveWhenExpired is false. If not specified, time to live defaults + to infinite value. + - name: --description + type: string + short-summary: The description of the health information. + long-summary: It represents free text used to add human readable + information about the report. The maximum string length for the + description is 4096 characters. If the provided string is longer, + it will be automatically truncated. When truncated, the last + characters of the description contain a marker "[Truncated]", and + total string size is 4096 characters. The presence of the marker + indicates to users that truncation occurred. Note that when + truncated, the description has less than 4096 characters from the + original string. + - name: --sequence-number + type: string + short-summary: The sequence number for this health report as a + numeric string. + long-summary: The report sequence number is used by the health store + to detect stale reports. If not specified, a sequence number is + auto-generated by the health client when a report is added. + - name: --remove-when-expired + type: bool + short-summary: Value that indicates whether the report is removed + from health store when it expires. + long-summary: If set to true, the report is removed from the health + store after it expires. If set to false, the report is treated as + an error when expired. The value of this property is false by + default. When clients report periodically, they should set + RemoveWhenExpired false (default). This way, if the reporter has + issues (e.g. deadlock) and can't report, the entity is evaluated at + error when the health report expires. This flags the entity as + being in Error health state. 
+ - name: --immediate + type: bool + short-summary: A flag which indicates whether the report should be + sent immediately. + long-summary: A health report is sent to a Service Fabric gateway + Application, which forwards to the health store. If Immediate is + set to true, the report is sent immediately from Http Gateway to + the health store, regardless of the fabric client settings that + the Http Gateway Application is using. This is useful for critical + reports that should be sent as soon as possible. Depending on + timing and other conditions, sending the report may still fail, + for example if the Http Gateway is closed or the message doesn't + reach the Gateway. If Immediate is set to false, the report is sent + based on the health client settings from the Http Gateway. + Therefore, it will be batched according to the + HealthReportSendInterval configuration. This is the recommended + setting because it allows the health client to optimize health + reporting messages to health store as well as health report + processing. By default, reports are not sent immediately. +""" + +helps['replica report-health'] = """ + type: command + short-summary: Sends a health report on the Service Fabric replica + long-summary: Reports health state of the specified Service Fabric replica. + The report must contain the information about the source of the health + report and property on which it is reported. The report is sent to a + Service Fabric gateway Replica, which forwards to the health store. The + report may be accepted by the gateway, but rejected by the health store + after extra validation. For example, the health store may reject the + report because of an invalid parameter, like a stale sequence number. + To see whether the report was applied in the health store, check that + the report appears in the events section. 
+ parameters: + - name: --partition-id + type: string + short-summary: The identity of the partition + - name: --replica-id + type: string + short-summary: The identity of the replica + - name: --service-kind + type: string + short-summary: "The kind of service replica (stateless or stateful) + for which the health is being reported. Following are the possible + values: 'Stateless', 'Stateful'." + - name: --source-id + type: string + short-summary: The source name which identifies the + client/watchdog/system component which generated the health + information. + - name: --health-property + type: string + short-summary: The property of the health information. + long-summary: An entity can have health reports for different + properties. The property is a string and not a fixed enumeration to + allow the reporter flexibility to categorize the state condition that + triggers the report. For example, a reporter with SourceId + "LocalWatchdog" can monitor the state of the available disk on a + node, so it can report "AvailableDisk" property on that node. The + same reporter can monitor the node connectivity, so it can report a + property "Connectivity" on the same node. In the health store, + these reports are treated as separate health events for the + specified node. Together with the SourceId, the property uniquely + identifies the health information. + - name: --health-state + type: string + short-summary: "Possible values include: 'Invalid', 'Ok', 'Warning', + 'Error', 'Unknown'" + - name: --ttl + type: string + short-summary: The duration for which this health report is valid. + This field is using ISO8601 format for specifying the duration. + long-summary: When clients report periodically, they should send + reports with higher frequency than time to live. If clients report + on transition, they can set the time to live to infinite. 
When + time to live expires, the health event that contains the health + information is either removed from health store, if + RemoveWhenExpired is true, or evaluated at error, if + RemoveWhenExpired is false. If not specified, time to live defaults + to infinite value. + - name: --description + type: string + short-summary: The description of the health information. + long-summary: It represents free text used to add human readable + information about the report. The maximum string length for the + description is 4096 characters. If the provided string is longer, + it will be automatically truncated. When truncated, the last + characters of the description contain a marker "[Truncated]", and + total string size is 4096 characters. The presence of the marker + indicates to users that truncation occurred. Note that when + truncated, the description has less than 4096 characters from the + original string. + - name: --sequence-number + type: string + short-summary: The sequence number for this health report as a + numeric string. + long-summary: The report sequence number is used by the health store + to detect stale reports. If not specified, a sequence number is + auto-generated by the health client when a report is added. + - name: --remove-when-expired + type: bool + short-summary: Value that indicates whether the report is removed + from health store when it expires. + long-summary: If set to true, the report is removed from the health + store after it expires. If set to false, the report is treated as + an error when expired. The value of this property is false by + default. When clients report periodically, they should set + RemoveWhenExpired false (default). This way, if the reporter has + issues (e.g. deadlock) and can't report, the entity is evaluated at + error when the health report expires. This flags the entity as + being in Error health state. 
+ - name: --immediate + type: bool + short-summary: A flag which indicates whether the report should be + sent immediately. + long-summary: A health report is sent to a Service Fabric gateway + Application, which forwards to the health store. If Immediate is + set to true, the report is sent immediately from Http Gateway to + the health store, regardless of the fabric client settings that + the Http Gateway Application is using. This is useful for critical + reports that should be sent as soon as possible. Depending on + timing and other conditions, sending the report may still fail, + for example if the Http Gateway is closed or the message doesn't + reach the Gateway. If Immediate is set to false, the report is sent + based on the health client settings from the Http Gateway. + Therefore, it will be batched according to the + HealthReportSendInterval configuration. This is the recommended + setting because it allows the health client to optimize health + reporting messages to health store as well as health report + processing. By default, reports are not sent immediately. +""" diff --git a/src/sfctl/helps/main.py b/src/sfctl/helps/main.py index 8b932530..289ace66 100644 --- a/src/sfctl/helps/main.py +++ b/src/sfctl/helps/main.py @@ -11,9 +11,20 @@ helps[''] = """ type: group short-summary: Commands for managing Service Fabric clusters - and entities + and entities. This version is compatible with Service Fabric 6.0 + runtime. long-summary: Commands follow the noun-verb pattern. See subgroups for more - information + information. 
+""" + +helps['rpm'] = """ + type: group + short-summary: Query and send commands to the repair manager service +""" + +helps['sa-cluster'] = """ + type: group + short-summary: Manage stand-alone Service Fabric clusters """ helps['application'] = """ diff --git a/src/sfctl/params.py b/src/sfctl/params.py index e1f5427f..b5836dcd 100644 --- a/src/sfctl/params.py +++ b/src/sfctl/params.py @@ -28,6 +28,9 @@ def custom_arguments(self, _): #pylint: disable=too-many-statements arg_context.argument('min_node_count', type=int) arg_context.argument('max_node_count', type=int) + with ArgumentsContext(self, 'application list') as arg_context: + arg_context.argument('application_definition_kind_filter', type=int) + with ArgumentsContext(self, 'application upgrade') as arg_context: arg_context.argument('parameters', type=json_encoded) arg_context.argument('default_service_health_policy', @@ -108,9 +111,43 @@ def custom_arguments(self, _): #pylint: disable=too-many-statements with ArgumentsContext(self, 'application type-list') as arg_context: arg_context.argument('max_results', type=int) + arg_context.argument('application_type_definition_kind_filter', + type=int) with ArgumentsContext(self, 'application type') as arg_context: arg_context.argument('max_results', type=int) with ArgumentsContext(self, 'compose list') as arg_context: arg_context.argument('max_results', type=int) + + with ArgumentsContext(self, 'cluster upgrade') as arg_context: + arg_context.argument('replica_set_check_timeout', type=int) + arg_context.argument('unhealthy_nodes', type=int) + arg_context.argument('unhealthy_applications', type=int) + arg_context.argument('app_type_health_map', type=json_encoded) + arg_context.argument('delta_unhealthy_nodes', type=int) + arg_context.argument('upgrade_domain_delta_unhealthy_nodes', type=int) + arg_context.argument('app_health_map', type=json_encoded) + + with ArgumentsContext(self, 'sa-cluster config-upgrade') as arg_context: + 
arg_context.argument('unhealthy_applications', type=int) + arg_context.argument('unhealthy_nodes', type=int) + arg_context.argument('delta_unhealthy_nodes', type=int) + arg_context.argument('upgrade_domain_delta_unhealthy_nodes', type=int) + + with ArgumentsContext(self, 'cluster upgrade-update') as arg_context: + arg_context.argument('replica_set_check_timeout', type=int) + arg_context.argument('unhealthy_nodes', type=int) + arg_context.argument('unhealthy_applications', type=int) + arg_context.argument('app_type_health_map', type=json_encoded) + arg_context.argument('delta_unhealthy_nodes', type=int) + arg_context.argument('upgrade_domain_delta_unhealthy_nodes', type=int) + arg_context.argument('app_health_map', type=json_encoded) + + with ArgumentsContext(self, 'rpm list') as arg_context: + arg_context.argument('state_filter', type=int) + + with ArgumentsContext(self, 'compose upgrade') as arg_context: + arg_context.argument('unhealthy_app', type=int) + arg_context.argument('default_svc_type_health_map', type=json_encoded) + arg_context.argument('svc_type_health_map', type=json_encoded)