Skip to content

Commit

Permalink
fix: Clean offline node from resource group after qc restart (#33232)
Browse files Browse the repository at this point in the history
issue: #33200 #33207
PR #33104 causes offline nodes to be kept in their resource group after qc
recovers. Those offline nodes are then assigned to new replicas as rwNodes, and
requests sent to those nodes fail with NodeNotFound.

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
  • Loading branch information
weiliu1031 committed May 22, 2024
1 parent 3d105fc commit 303470f
Showing 1 changed file with 10 additions and 23 deletions.
33 changes: 10 additions & 23 deletions internal/querycoordv2/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,7 @@ func (s *Server) startQueryCoord() error {
s.nodeMgr.Stopping(node.ServerID)
}
}
s.checkReplicas()
s.checkNodeStateInRG()
for _, node := range sessions {
s.handleNodeUp(node.ServerID)
}
Expand Down Expand Up @@ -778,28 +778,15 @@ func (s *Server) handleNodeDown(node int64) {
s.meta.ResourceManager.HandleNodeDown(node)
}

// checkReplicas checks whether replica contains offline node, and remove those nodes
func (s *Server) checkReplicas() {
for _, collection := range s.meta.CollectionManager.GetAll() {
log := log.With(zap.Int64("collectionID", collection))
replicas := s.meta.ReplicaManager.GetByCollection(collection)
for _, replica := range replicas {
toRemove := make([]int64, 0)
for _, node := range replica.GetNodes() {
if s.nodeMgr.Get(node) == nil {
toRemove = append(toRemove, node)
}
}

if len(toRemove) > 0 {
log := log.With(
zap.Int64("replicaID", replica.GetID()),
zap.Int64s("offlineNodes", toRemove),
)
log.Info("some nodes are offline, remove them from replica", zap.Any("toRemove", toRemove))
if err := s.meta.ReplicaManager.RemoveNode(replica.GetID(), toRemove...); err != nil {
log.Warn("failed to remove offline nodes from replica")
}
func (s *Server) checkNodeStateInRG() {
for _, rgName := range s.meta.ListResourceGroups() {
rg := s.meta.ResourceManager.GetResourceGroup(rgName)
for _, node := range rg.GetNodes() {
info := s.nodeMgr.Get(node)
if info == nil {
s.meta.ResourceManager.HandleNodeDown(node)
} else if info.IsStoppingState() {
s.meta.ResourceManager.HandleNodeStopping(node)
}
}
}
Expand Down

0 comments on commit 303470f

Please sign in to comment.