Fix reapTime logic in NetworkDB #1944
Changes from 2 commits
`.gitignore`:

```diff
@@ -38,3 +38,4 @@ cmd/dnet/dnet
 libnetworkbuild.created
 test/networkDb/testMain
+test/networkDb/gossipdb
```
`networkdb/delegate.go`:

```diff
@@ -1,9 +1,9 @@
 package networkdb

 import (
-	"fmt"
 	"net"
 	"strings"
+	"time"

 	"github.com/gogo/protobuf/proto"
 	"github.com/sirupsen/logrus"
```
```diff
@@ -198,8 +198,7 @@ func (nDB *NetworkDB) handleNetworkEvent(nEvent *NetworkEvent) bool {
 }

 func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent) bool {
-	// Update our local clock if the received messages has newer
-	// time.
+	// Update our local clock if the received messages has newer time.
 	nDB.tableClock.Witness(tEvent.LTime)

 	// Ignore the table events for networks that are in the process of going away
```
```diff
@@ -235,20 +234,26 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent) bool {
 		node:     tEvent.NodeName,
 		value:    tEvent.Value,
 		deleting: tEvent.Type == TableEventTypeDelete,
+		reapTime: time.Duration(tEvent.ResidualReapTime) * time.Second,
 	}

-	if e.deleting {
+	// All the entries marked for deletion should have a reapTime set greater than 0
+	// This case can happen if the cluster is running different versions of the engine where the old version does not have the
+	// field. If that is not the case, this can be a BUG
+	if e.deleting && e.reapTime == 0 {
+		logrus.Warnf("handleTableEvent object %+v has a 0 reapTime, is the cluster running the same docker engine version?", tEvent)
 		e.reapTime = reapInterval
 	}

 	nDB.Lock()
-	nDB.indexes[byTable].Insert(fmt.Sprintf("/%s/%s/%s", tEvent.TableName, tEvent.NetworkID, tEvent.Key), e)
-	nDB.indexes[byNetwork].Insert(fmt.Sprintf("/%s/%s/%s", tEvent.NetworkID, tEvent.TableName, tEvent.Key), e)
+	nDB.createOrUpdateEntry(tEvent.NetworkID, tEvent.TableName, tEvent.Key, e)
 	nDB.Unlock()

 	if err != nil && tEvent.Type == TableEventTypeDelete {
-		// If it is a delete event and we didn't have the entry here don't repropagate
-		return true
+		// If it is a delete event and we did not have a state for it, don't propagate to the application
+		// If the residual reapTime is lower or equal to 1/6 of the total reapTime don't bother broadcasting it around
+		// most likely the cluster is already aware of it, if not who will sync with this node will catch the state too.
+		return e.reapTime > reapPeriod/6
 	}

 	var op opType
```

Review thread:

- "Is this change of behavior expected here?"
- "Nope, this is actually the point of this patch: the reapTime should be the one passed in the proto (line 237). The old behavior was such that, no matter when you got the notification, the clock was being set to the max value of 30 min."
- "Except in that very unfortunate circumstance that the ResidualReapTime populated by the peer has in fact gone down to exactly 0 seconds at the time of populating the message (https://github.com/docker/libnetwork/pull/1944/files#diff-ced1ed36c3141cb15bdf5acafed78d67R583). If that happens, in such a rare situation we will reset this back to 30 seconds and the event will get cleaned up in 30 seconds (or, in the rarest of occasions, we get another genuine 0-second ResidualReapTime). @fcrisciani is that correct?"
- "BTW, yes. This is the actual point of this patch and I think it is a good idea to get this done."
- "@mavenugo your point is correct only for the case of the truncation of the number. I will add an extra check that the truncated number is still strictly greater than 0."
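To make the truncation corner case in this thread concrete: the patch carries the residual reap time on the wire as whole seconds (`time.Duration(tEvent.ResidualReapTime) * time.Second` above), so a residual below one second serializes to 0 and becomes indistinguishable from a peer that never set the field. A minimal stand-alone sketch of that edge case (variable names are illustrative, not from the patch):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Remaining reap time on the sender when it builds the TableEvent.
	residual := 800 * time.Millisecond

	// The protobuf field carries whole seconds, so a sub-second residual
	// truncates to 0 and looks exactly like "field never populated"
	// (e.g. a peer running an older engine without the field).
	onWire := int32(residual / time.Second) // == 0

	fmt.Println("serialized residual:", onWire)

	// Hence the extra sender-side guard mentioned in the thread: only use
	// the residual value if it is still strictly greater than 0 after
	// truncation; otherwise treat it as if the field were missing.
	if onWire > 0 {
		fmt.Println("send residual:", onWire)
	} else {
		fmt.Println("treat as missing; receiver falls back to reapInterval")
	}
}
```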
```diff
@@ -303,22 +308,17 @@ func (nDB *NetworkDB) handleTableMessage(buf []byte, isBulkSync bool) {
 		n, ok := nDB.networks[nDB.config.NodeName][tEvent.NetworkID]
 		nDB.RUnlock()

-		if !ok {
-			return
-		}
-
-		broadcastQ := n.tableBroadcasts
-
-		if broadcastQ == nil {
+		// if the network is not there anymore, OR we are leaving the network OR the broadcast queue is not present
+		if !ok || n.leaving || n.tableBroadcasts == nil {
 			return
 		}

-		broadcastQ.QueueBroadcast(&tableEventMessage{
+		n.tableBroadcasts.QueueBroadcast(&tableEventMessage{
 			msg:   buf,
 			id:    tEvent.NetworkID,
 			tname: tEvent.TableName,
 			key:   tEvent.Key,
-			node:  nDB.config.NodeName,
+			node:  tEvent.NodeName,
 		})
 	}
 }
```

Review thread:

- "Being a rebroadcast event, and based on the current logic of depending on the residual reap timer, what is the purpose of this change?"
- "If I understood correctly Flavio's explanation, it's not strictly related to this PR; it's an issue he saw while he was looking at the code. Without this change, when the event was rebroadcast, the original node emitting the event was replaced with the one rebroadcasting it. So you could receive an event saying it originated from one node when it actually came from another."
- "The node is simply rebroadcasting the message of another node; at the top of the function there is the check that skips messages where the owner matches the current node."
- "OK, this is a bit subtle and I think good testing coverage will help mitigate the risk."
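The subtlety the thread points at is that loop prevention relies on an owner check rather than on rewriting the node name. A simplified, self-contained sketch of that pattern, with illustrative stand-in types rather than the real libnetwork definitions:

```go
package main

import "fmt"

// tableEvent is an illustrative stand-in for the gossip message; NodeName
// is the event's original owner and must survive rebroadcasts unchanged.
type tableEvent struct {
	NodeName string
	Key      string
}

// handle mimics the guard described in the thread: a node drops events it
// originated itself, which is what breaks the rebroadcast loop once a
// message travels back to its owner.
func handle(self string, ev tableEvent) bool {
	if ev.NodeName == self {
		return false // own event came back around: drop it
	}
	// Otherwise process it and requeue it with the ORIGINAL owner; writing
	// the local node name here (the old behavior) would misreport the
	// origin to other receivers and defeat the owner check above.
	return true
}

func main() {
	ev := tableEvent{NodeName: "node1", Key: "k1"}
	fmt.Println(handle("node2", ev)) // true: process and rebroadcast
	fmt.Println(handle("node1", ev)) // false: owner drops its own event
}
```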
`networkdb/networkdb.go`:

```diff
@@ -141,6 +141,11 @@ type network struct {

 	// Number of gossip messages sent related to this network during the last stats collection period
 	qMessagesSent int
+
+	// Number of entries on the network. This value is the sum of all the entries of all the tables of a specific network.
+	// Its use is for statistics purposes. It keep tracks of database size and is printed per network every StatsPrintPeriod
+	// interval
+	entriesNumber int
 }

 // Config represents the configuration of the networdb instance and
```

Review thread:

- "I guess this is used only for debugging purposes... It would be good to write a comment reflecting the purpose, and we should not be tempted to use this as a reference counter for any logic in the future."
- "Will add extra comments."
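To illustrate the stats-only role the reviewers ask to document, here is a hypothetical dump routine in the spirit of the StatsPrintPeriod output; only `entriesNumber` and `qMessagesSent` come from the struct above, everything else is a stand-in:

```go
package main

import "fmt"

// network is a stand-in for the struct above; only qMessagesSent and
// entriesNumber mirror the real fields.
type network struct {
	qMessagesSent int
	entriesNumber int
}

func main() {
	networks := map[string]*network{
		"netA": {qMessagesSent: 42, entriesNumber: 7},
		"netB": {qMessagesSent: 3, entriesNumber: 1},
	}
	// Printed per network every stats period; informational only, never a
	// reference counter driving logic (per the review note above).
	for nid, n := range networks {
		fmt.Printf("network %s: %d entries, %d gossip messages sent\n",
			nid, n.entriesNumber, n.qMessagesSent)
	}
}
```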
```diff
@@ -338,8 +343,7 @@ func (nDB *NetworkDB) CreateEntry(tname, nid, key string, value []byte) error {
 	}

 	nDB.Lock()
-	nDB.indexes[byTable].Insert(fmt.Sprintf("/%s/%s/%s", tname, nid, key), entry)
-	nDB.indexes[byNetwork].Insert(fmt.Sprintf("/%s/%s/%s", nid, tname, key), entry)
+	nDB.createOrUpdateEntry(nid, tname, key, entry)
 	nDB.Unlock()

 	return nil
```

```diff
@@ -365,8 +369,7 @@ func (nDB *NetworkDB) UpdateEntry(tname, nid, key string, value []byte) error {
 	}

 	nDB.Lock()
-	nDB.indexes[byTable].Insert(fmt.Sprintf("/%s/%s/%s", tname, nid, key), entry)
-	nDB.indexes[byNetwork].Insert(fmt.Sprintf("/%s/%s/%s", nid, tname, key), entry)
+	nDB.createOrUpdateEntry(nid, tname, key, entry)
 	nDB.Unlock()

 	return nil
```

```diff
@@ -410,8 +413,7 @@ func (nDB *NetworkDB) DeleteEntry(tname, nid, key string) error {
 	}

 	nDB.Lock()
-	nDB.indexes[byTable].Insert(fmt.Sprintf("/%s/%s/%s", tname, nid, key), entry)
-	nDB.indexes[byNetwork].Insert(fmt.Sprintf("/%s/%s/%s", nid, tname, key), entry)
+	nDB.createOrUpdateEntry(nid, tname, key, entry)
 	nDB.Unlock()

 	return nil
```

```diff
@@ -473,7 +475,7 @@ func (nDB *NetworkDB) deleteNodeNetworkEntries(nid, node string) {

 		entry := &entry{
 			ltime:    oldEntry.ltime,
-			node:     node,
+			node:     oldEntry.node,
 			value:    oldEntry.value,
 			deleting: true,
 			reapTime: reapInterval,
```

```diff
@@ -488,12 +490,10 @@ func (nDB *NetworkDB) deleteNodeNetworkEntries(nid, node string) {
 				// without doing a delete of all the objects
 				entry.ltime++
 			}
-			nDB.indexes[byTable].Insert(fmt.Sprintf("/%s/%s/%s", tname, nid, key), entry)
-			nDB.indexes[byNetwork].Insert(fmt.Sprintf("/%s/%s/%s", nid, tname, key), entry)
+			nDB.createOrUpdateEntry(nid, tname, key, entry)
 		} else {
 			// the local node is leaving the network, all the entries of remote nodes can be safely removed
-			nDB.indexes[byTable].Delete(fmt.Sprintf("/%s/%s/%s", tname, nid, key))
-			nDB.indexes[byNetwork].Delete(fmt.Sprintf("/%s/%s/%s", nid, tname, key))
+			nDB.deleteEntry(nid, tname, key)
 		}

 		nDB.broadcaster.Write(makeEvent(opDelete, tname, nid, key, entry.value))
```

```diff
@@ -513,8 +513,7 @@ func (nDB *NetworkDB) deleteNodeTableEntries(node string) {
 		nid := params[1]
 		key := params[2]

-		nDB.indexes[byTable].Delete(fmt.Sprintf("/%s/%s/%s", tname, nid, key))
-		nDB.indexes[byNetwork].Delete(fmt.Sprintf("/%s/%s/%s", nid, tname, key))
+		nDB.deleteEntry(nid, tname, key)

 		nDB.broadcaster.Write(makeEvent(opDelete, tname, nid, key, oldEntry.value))
 		return false
```
```diff
@@ -558,7 +557,12 @@ func (nDB *NetworkDB) JoinNetwork(nid string) error {
 		nodeNetworks = make(map[string]*network)
 		nDB.networks[nDB.config.NodeName] = nodeNetworks
 	}
-	nodeNetworks[nid] = &network{id: nid, ltime: ltime}
+	n, ok := nodeNetworks[nid]
+	var entries int
+	if ok {
+		entries = n.entriesNumber
+	}
+	nodeNetworks[nid] = &network{id: nid, ltime: ltime, entriesNumber: entries}
 	nodeNetworks[nid].tableBroadcasts = &memberlist.TransmitLimitedQueue{
 		NumNodes: func() int {
 			nDB.RLock()
```

```diff
@@ -567,6 +571,7 @@ func (nDB *NetworkDB) JoinNetwork(nid string) error {
 		},
 		RetransmitMult: 4,
 	}
+
 	nDB.addNetworkNode(nid, nDB.config.NodeName)
 	networkNodes := nDB.networkNodes[nid]
 	nDB.Unlock()
```
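Why carry `entriesNumber` over on a rejoin: entries written during the previous membership, including tombstones still waiting for the reaper, remain in the indexes, so resetting the counter to zero would let it drift out of sync. A stand-alone sketch of the same pattern (types are illustrative):

```go
package main

import "fmt"

// network is an illustrative stand-in; entriesNumber mirrors the field
// added in this patch.
type network struct {
	id            string
	entriesNumber int
}

// join mirrors the JoinNetwork change above: when a node rejoins a network
// it was already tracking, the aggregated entry counter is carried over
// instead of being reset, because the entries are still in the indexes.
func join(nodeNetworks map[string]*network, nid string) {
	var entries int
	if n, ok := nodeNetworks[nid]; ok {
		entries = n.entriesNumber
	}
	nodeNetworks[nid] = &network{id: nid, entriesNumber: entries}
}

func main() {
	nets := map[string]*network{"net1": {id: "net1", entriesNumber: 5}}
	join(nets, "net1")
	fmt.Println(nets["net1"].entriesNumber) // 5: counter preserved on rejoin
}
```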
```diff
@@ -679,3 +684,33 @@ func (nDB *NetworkDB) updateLocalNetworkTime() {
 		n.ltime = ltime
 	}
 }
+
+// createOrUpdateEntry this function handles the creation or update of entries into the local
+// tree store. It is also used to keep in sync the entries number of the network (all tables are aggregated)
+func (nDB *NetworkDB) createOrUpdateEntry(nid, tname, key string, entry interface{}) (bool, bool) {
+	_, okTable := nDB.indexes[byTable].Insert(fmt.Sprintf("/%s/%s/%s", tname, nid, key), entry)
+	_, okNetwork := nDB.indexes[byNetwork].Insert(fmt.Sprintf("/%s/%s/%s", nid, tname, key), entry)
+	if !okNetwork {
+		// Add only if it is an insert not an update
+		n, ok := nDB.networks[nDB.config.NodeName][nid]
+		if ok {
+			n.entriesNumber++
+		}
+	}
+	return okTable, okNetwork
+}
+
+// deleteEntry this function handles the deletion of entries into the local tree store.
+// It is also used to keep in sync the entries number of the network (all tables are aggregated)
+func (nDB *NetworkDB) deleteEntry(nid, tname, key string) (bool, bool) {
+	_, okTable := nDB.indexes[byTable].Delete(fmt.Sprintf("/%s/%s/%s", tname, nid, key))
+	_, okNetwork := nDB.indexes[byNetwork].Delete(fmt.Sprintf("/%s/%s/%s", nid, tname, key))
+	if okNetwork {
+		// Remove only if the delete is successful
+		n, ok := nDB.networks[nDB.config.NodeName][nid]
+		if ok {
+			n.entriesNumber--
+		}
+	}
+	return okTable, okNetwork
+}
```
Review thread:

- "Are we 100% sure the path will have 4 `/` in it?"
- "Why 4? It is actually 3, like: `fmt.Sprintf("/%s/%s/%s", nid, tname, key)`. Also, the insertion and deletion are now contained in single functions: `createOrUpdateEntry` and `deleteEntry`."
- "Sorry, I meant 3."
- "Yes, this is the current expectation in the code and, as @fcrisciani mentioned, it is all organized now under the same functions, so this assumption is valid."
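A quick stand-alone check of the 3-slash assumption; the table/network/key values here are made up, and the `path[1:]` split is an assumption about how the surrounding code peels off the leading slash:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Index keys follow the 3-slash layout discussed above:
	//   byTable:   /<table>/<network>/<key>
	//   byNetwork: /<network>/<table>/<key>
	path := fmt.Sprintf("/%s/%s/%s", "some_table", "net1", "key1")

	// Stripping the leading slash and splitting yields exactly three
	// components, which matches the params[1]/params[2] indexing seen in
	// deleteNodeTableEntries above.
	params := strings.Split(path[1:], "/")
	fmt.Println(len(params), params[0], params[1], params[2])
	// Output: 3 some_table net1 key1
}
```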