Skip to content

Commit

Permalink
Added support for automatic disconnect when a dynamic cluster node is…
Browse files Browse the repository at this point in the history
… shut down

When a dynamic cluster is scaled up for certain occasions, it makes
sense to support scaling down once these occasions are over, without
relying on the configured cluster disconnect timeout
(ClusterAutodeleteInterval). The new code automatically sends a
disconnect request to the canonical server when a dynamic cluster
node terminates gracefully.

Depending on the configuration, a new version of NaviServer will be
necessary to reliably execute disconnect requests. Appropriate changes
are in the NaviServer release/4.99 and main branches.

- Bumped version numbers:
  * acs-tcl    to 5.10.1b4
  * acs-admin  to 5.10.1b4
  • Loading branch information
gustafn committed Dec 17, 2023
1 parent 1a7a765 commit 7cbc3e6
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 60 deletions.
6 changes: 3 additions & 3 deletions packages/acs-admin/acs-admin.info
Expand Up @@ -9,7 +9,7 @@
<implements-subsite-p>f</implements-subsite-p>
<inherit-templates-p>t</inherit-templates-p>

<version name="5.10.1b3" url="http://openacs.org/repository/download/apm/acs-admin-5.10.1b3.apm">
<version name="5.10.1b4" url="http://openacs.org/repository/download/apm/acs-admin-5.10.1b4.apm">
<owner url="mailto:dhogaza@pacifier.com">Don Baccus</owner>
<summary>An interface for Site-wide administration of an OpenACS Installation.</summary>
<release-date>2023-02-08</release-date>
Expand All @@ -20,9 +20,9 @@
<license>GPL</license>
<maturity>3</maturity>

<provides url="acs-admin" version="5.10.1b3"/>
<provides url="acs-admin" version="5.10.1b4"/>
<requires url="acs-kernel" version="5.10.1b4"/>
<requires url="acs-tcl" version="5.10.1b3"/>
<requires url="acs-tcl" version="5.10.1b4"/>
<requires url="acs-templating" version="5.10.1b1"/>
<requires url="acs-mail-lite" version="5.10.1b1"/>
<requires url="acs-authentication" version="5.10.1b1"/>
Expand Down
10 changes: 5 additions & 5 deletions packages/acs-admin/www/cluster.tcl
Expand Up @@ -3,7 +3,7 @@ ad_page_contract {

@creation-date Feb 8, 2023
} {
{drop_node:nohtml,notnull ""}
{disconnect_node:nohtml,notnull ""}
{flush_node:nohtml,notnull ""}
}

Expand All @@ -13,11 +13,11 @@ set context [list $page_title]
set server_cluster_enabled_p [server_cluster_enabled_p]
set dynamic_cluster_nodes [::acs::cluster dynamic_cluster_nodes]

if {$drop_node ne ""} {
if {$disconnect_node ne ""} {
#
# Drop the provided node from DynamicClusterPeers
# Disconnect the provided node from DynamicClusterPeers
#
acs::cluster drop_dynamic_node $drop_node
acs::cluster dynamic_cluster_reconfigure disconnect $disconnect_node
set done 1
} elseif {$flush_node ne ""} {
#
Expand Down Expand Up @@ -100,7 +100,7 @@ if {$server_cluster_enabled_p} {
<a href="./cluster?flush_node=@cluster_nodes.node_name@"><adp:icon name="bandaid" title="Flush Cache"></a>&nbsp;
<if @cluster_nodes.current_p@ true><adp:icon name="trash" invisible="true"></if>
<else><if @cluster_nodes.canonical_p@ true><adp:icon name="trash" invisible="true"></if>
<else><a href="./cluster?drop_node=@cluster_nodes.node_name@"><adp:icon name="trash"
<else><a href="./cluster?disconnect_node=@cluster_nodes.node_name@"><adp:icon name="trash"
title="Disconnect Peer; trigger rejoin and flush in a few seconds when server is alive"></a></else>
</else>
}
Expand Down
4 changes: 2 additions & 2 deletions packages/acs-tcl/acs-tcl.info
Expand Up @@ -9,7 +9,7 @@
<implements-subsite-p>f</implements-subsite-p>
<inherit-templates-p>t</inherit-templates-p>

<version name="5.10.1b3" url="http://openacs.org/repository/download/apm/acs-tcl-5.10.1b3.apm">
<version name="5.10.1b4" url="http://openacs.org/repository/download/apm/acs-tcl-5.10.1b4.apm">
<owner url="http://openacs.org">OpenACS</owner>
<summary>The Kernel Tcl API library.</summary>
<release-date>2023-05-15</release-date>
Expand All @@ -18,7 +18,7 @@
<license>GPL version 2</license>
<maturity>3</maturity>

<provides url="acs-tcl" version="5.10.1b3"/>
<provides url="acs-tcl" version="5.10.1b4"/>
<requires url="acs-bootstrap-installer" version="5.10.1b1"/>
<requires url="acs-kernel" version="5.10.1b4"/>

Expand Down
28 changes: 19 additions & 9 deletions packages/acs-tcl/tcl/cluster-init.tcl
Expand Up @@ -77,18 +77,28 @@ if {[server_cluster_enabled_p]} {
::acs::cluster register_nodes -startup

ns_atstartup {
#
# We could add some code for testing actively keep-alive
# status.
#
ns_log notice "CHECK ::throttle '[::info commands ::throttle]'"
if {0 && [::info commands ::throttle] ne ""} {
ns_log notice "CHECK calling ::acs::cluster check_nodes"
throttle do ::acs::cluster check_nodes
ns_log notice "acs::cluster starting:" \
"running as canonical server [::acs::cluster current_server_is_canonical_server]," \
"cluster nodes: [nsv_get cluster cluster_peer_nodes]"
}

#
# Register callback for shutdown operations. When the shutdown is
# performed at a dynamic cluster node, disconnect the node from the
# cluster.
#
ns_atshutdown {
    #
    # Shutdown callback: log which kind of cluster node is going
    # down. Only a dynamic cluster peer sends a disconnect request
    # to the canonical server; canonical and static nodes just log.
    #
    if {[::acs::cluster current_server_is_canonical_server]} {
        ns_log notice "acs::cluster: shutdown canonical server"
    } else {
        if {[::acs::cluster current_server_is_dynamic_cluster_peer]} {
            ns_log notice "acs::cluster: shutdown dynamic cluster peer (perform disconnect operation)"
            acs::cluster send_disconnect_request_to_canonical_server
        } else {
            ns_log notice "acs::cluster: shutdown static cluster peer"
        }
    }
}
}
ns_log notice "cluster-init done"

#
# Local variables:
# mode: tcl
Expand Down
116 changes: 75 additions & 41 deletions packages/acs-tcl/tcl/cluster-procs.tcl
Expand Up @@ -103,6 +103,7 @@ namespace eval ::acs {
acs::cache_flush_all ""
acs::cache_flush_pattern ""
::acs::cluster "^::acs::cluster\s+join_request"
::acs::cluster "^::acs::cluster\s+disconnect_request"
}

#
Expand Down Expand Up @@ -309,25 +310,7 @@ namespace eval ::acs {
-package_id $::acs::kernel_id \
-parameter DynamicClusterPeers]
}

:public method drop_dynamic_node {node} {
#
# Drop the provided node from DynamicClusterPeers
#
set dynamic_cluster_nodes [:dynamic_cluster_nodes]
set p [lsearch $dynamic_cluster_nodes $node]
if {$p != -1} {
set cluster_nodes [lreplace $dynamic_cluster_nodes $p $p]
parameter::set_value \
-package_id $::acs::kernel_id \
-parameter DynamicClusterPeers \
-value $cluster_nodes
} else {
ns_log warning "cluster: can't drop node '$node': not in the" \
"dynamic cluster configuration: $dynamic_cluster_nodes"
}
}


:public method check_state {} {
#
# Check the liveness of the dynamic cluster nodes. This
Expand All @@ -339,14 +322,14 @@ namespace eval ::acs {
-package_id $::acs::kernel_id \
-parameter ClusterAutodeleteInterval \
-default 2m]

foreach node [:dynamic_cluster_nodes] {
set last_contact [acs::cluster last_contact $node]
if {$last_contact ne ""} {
set seconds [expr {$last_contact/1000}]
if {[clock seconds]-($last_contact/1000) > [ns_baseunit -time $autodeleteInterval]} {
ns_log notice "[self] drop dynamic node $node due to ClusterAutodeleteInterval"
:drop_dynamic_node $node
ns_log notice "[self] disconnect dynamic node $node due to ClusterAutodeleteInterval"
:disconnect_dynamic_node $node
}
}
}
Expand Down Expand Up @@ -394,7 +377,7 @@ namespace eval ::acs {
} {
ns_log warning "cluster node is not listed in dynamic peers." \
"Must re-join canonical server: ${:canonicalServerLocation}"
:send_join_request ${:canonicalServerLocation}
:send_join_request_to_canonical_server
}
}

Expand Down Expand Up @@ -534,7 +517,7 @@ namespace eval ::acs {
return $result
}

:method current_server_is_dynamic_cluster_peer {} {
:public method current_server_is_dynamic_cluster_peer {} {
#
# We are a dynamic cluster peer, when we are not the
# canonical server nor listed in the static server
Expand Down Expand Up @@ -655,46 +638,84 @@ namespace eval ::acs {
return $result
}

:public method send_join_request {location} {
:method send_dynamic_cluster_reconfigure_request {operation} {
#
# Send a join request to the canonical server.
# Send a cluster reconfigure request to the canonical server.
#
:log "send_join_request to $location"
set r [:send $location [self] join_request ${:currentServerLocation}]
#:log "... join_request returned $r"
set location ${:canonicalServerLocation}
:log "send $operation request to $location"
set r [:send $location [self] ${operation}_request ${:currentServerLocation}]
#:log "... $operation request returned $r"

if {[dict exists $r body]} {
#
# During startup/separation caches might not be in
# sync. Therefore, we have lost confidence in our
# caches and clear these.
#
:log "send_join_request returned [dict get $r body], flushing all my caches"
:log "$operation request returned [dict get $r body], flushing all my caches"
acs::cache_flush_all
}
}

:public method join_request {peerLocation} -returns boolean {
:public method send_join_request_to_canonical_server {} {
#
# Send a join request to the canonical server. This is a
# convenience wrapper that delegates to
# "send_dynamic_cluster_reconfigure_request" with the
# operation "join".
#
:send_dynamic_cluster_reconfigure_request join
}

:public method send_disconnect_request_to_canonical_server {} {
#
# A join request was received
# Send a disconnect request to the canonical server.
#
ns_log notice "Cluster join_request from '$peerLocation'"
:send_dynamic_cluster_reconfigure_request disconnect
}

:public method dynamic_cluster_reconfigure {operation qualifiedLocation} -returns boolean {
#
# Reconfigure the cluster via "join" or "disconnect" operation,
# when running on the canonical server. The result of the
# reconfiguration is a changed list of
# DynamicClusterPeers. The method returns a boolean value
# indicating success.
#
ns_log notice "Cluster reconfigure $operation from '$qualifiedLocation'"

set success 1
#
# Was the join request received by a canonical server?
# To be ultra-conservative, we could allow cluster
# reconfigure operations only on the canonical
# server. This would also require altering the
# acs-admin/cluster page to show the trash icon only when
# the page is executed on the canonical server.
#
if {![:current_server_is_canonical_server]} {
ns_log warning "Cluster join_request rejected," \
if {0 && ![:current_server_is_canonical_server]} {
ns_log warning "Cluster reconfigure rejected," \
"since it was received by a non-canonical server"
set success 0
} else {
#
# We know, we are running on the canonical server, an
# As long as the canonical-server check above is disabled, this
# branch is reached on any server; the request is assumed to be trustworthy.
#
ns_log notice "Cluster join_request $peerLocation accepted from $peerLocation"
ns_log notice "Cluster reconfigure $qualifiedLocation accepted from $qualifiedLocation"
set dynamicClusterNodes [:dynamic_cluster_nodes]
set dynamicClusterNodes [lsort -unique [concat $dynamicClusterNodes [:qualified_location $peerLocation]]]
switch $operation {
"join" {
set dynamicClusterNodes \
[lsort -unique [concat $dynamicClusterNodes $qualifiedLocation]]
}
"disconnect" {
set dynamicClusterNodes \
[lsearch -inline -all -not -exact $dynamicClusterNodes $qualifiedLocation]
}
default {
ns_log warning "Cluster reconfigure rejected," \
"received invalid operation '$operation'"
return 0
}
}
#
# The parameter::set_value operation causes a
# clusterwide cache-flush for the parameters
Expand All @@ -703,11 +724,24 @@ namespace eval ::acs {
-package_id $::acs::kernel_id \
-parameter DynamicClusterPeers \
-value $dynamicClusterNodes
ns_log notice "[self] Cluster join_request leads to DynamicClusterPeers $dynamicClusterNodes"
ns_log notice "[self] reconfigure $operation leads to DynamicClusterPeers $dynamicClusterNodes"
}
return $success
}

:public method join_request {peerLocation} -returns boolean {
    #
    # Handle an incoming request from $peerLocation to join the
    # dynamic cluster nodes. The peer location is converted to its
    # qualified form and handed to "dynamic_cluster_reconfigure",
    # whose boolean result is returned.
    #
    set qualifiedLocation [:qualified_location $peerLocation]
    return [:dynamic_cluster_reconfigure join $qualifiedLocation]
}

:public method disconnect_request {peerLocation} -returns boolean {
    #
    # Handle an incoming request to disconnect $peerLocation from
    # the dynamic cluster nodes. The peer location is converted to
    # its qualified form and handed to "dynamic_cluster_reconfigure",
    # whose boolean result is returned.
    #
    set qualifiedLocation [:qualified_location $peerLocation]
    return [:dynamic_cluster_reconfigure disconnect $qualifiedLocation]
}

:method peer_nodes {dynamic_peers} {
#
Expand Down Expand Up @@ -820,7 +854,7 @@ namespace eval ::acs {
ns_log notice "Current host ${:currentServerLocation} is not included in ${:configured_cluster_hosts}"
if {![:current_server_is_canonical_server]} {
ns_log notice "... must join at canonical server ${:canonicalServerLocation}"
:send_join_request ${:canonicalServerLocation}
:send_join_request_to_canonical_server
}
} else {
#ns_log notice "Current host ${:currentServerLocation} is included in ${:configured_cluster_hosts}"
Expand Down

0 comments on commit 7cbc3e6

Please sign in to comment.