Skip to content

Commit

Permalink
mds: ensure next replay is queued on req drop
Browse files Browse the repository at this point in the history
Not all client replay requests are queued at once since [1]. We require
the next request by queued when completed (unsafely) or during cleanup.
Not all code paths seem to handle this [2] so move it to a generic
location, MDCache::request_cleanup. Even so, this doesn't handle all
errors (so we must still be careful) as sometimes we must queue the next
replay request before an MDRequest is constructed [3] during some error
conditions.

Additionally, preserve the behavior of Server::journal_and_reply
queueing the next replay op. Otherwise, must wait for the request to be
durable before moving onto the next one, unnecessarily.

For reproducing, two specific cases are highlighted (thanks to @Mer1997 on
Github for locating these):

- The request is killed by a session close / eviction while a replayed request
  is queued and waiting for a journal flush (e.g. dirty inest locks).

- The request construction fails because the request is already in the
  active_requests. This could happen theoretically if a client resends the same
  request (same reqid) twice.

The first case is most probable but very difficult to reproduce for testing
purposes. The replayed op would need to wait on a journal flush (to be
restarted by C_MDS_RetryRequest).  Then, the request would need killed by a
session close.

[1] ed6a18d
[2] https://github.com/ceph/ceph/blob/a6f1a1c6c09d74f5918c715b05789f34f2ea0e90/src/mds/Server.cc#L2253-L2262
[3] https://github.com/ceph/ceph/blob/a6f1a1c6c09d74f5918c715b05789f34f2ea0e90/src/mds/Server.cc#L2380

Fixes: https://tracker.ceph.com/issues/56577
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
(cherry picked from commit 078ecaa)

Conflicts:
	src/mds/Mutation.h: lock dump changes not backported
	src/mds/Server.cc: minor code change
  • Loading branch information
batrick committed Nov 22, 2023
1 parent 6e93de6 commit 7243b68
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 17 deletions.
6 changes: 6 additions & 0 deletions src/mds/MDCache.cc
Expand Up @@ -9724,6 +9724,12 @@ void MDCache::request_cleanup(MDRequestRef& mdr)
// remove from map
active_requests.erase(mdr->reqid);

// queue next replay op?
if (mdr->is_queued_for_replay() && !mdr->get_queued_next_replay_op()) {
mdr->set_queued_next_replay_op();
mds->queue_one_replay();
}

if (mds->logger)
log_stat();

Expand Down
2 changes: 2 additions & 0 deletions src/mds/MDSRank.cc
Expand Up @@ -2055,13 +2055,15 @@ bool MDSRank::queue_one_replay()
if (!replay_queue.empty()) {
queue_waiter(replay_queue.front());
replay_queue.pop_front();
dout(10) << " queued next replay op" << dendl;
return true;
}
if (!replaying_requests_done) {
replaying_requests_done = true;
mdlog->flush();
}
maybe_clientreplay_done();
dout(10) << " journaled last replay op" << dendl;
return false;
}

Expand Down
7 changes: 7 additions & 0 deletions src/mds/Mutation.h
Expand Up @@ -387,6 +387,12 @@ struct MDRequestImpl : public MutationImpl {
void set_filepath(const filepath& fp);
void set_filepath2(const filepath& fp);
bool is_queued_for_replay() const;
bool get_queued_next_replay_op() const {
return queued_next_replay_op;
}
void set_queued_next_replay_op() {
queued_next_replay_op = true;
}
int compare_paths();

bool can_batch();
Expand Down Expand Up @@ -456,6 +462,7 @@ struct MDRequestImpl : public MutationImpl {
void _dump_op_descriptor_unlocked(std::ostream& stream) const override;
private:
mutable ceph::spinlock msg_lock;
bool queued_next_replay_op = false;
};

struct MDPeerUpdate {
Expand Down
42 changes: 25 additions & 17 deletions src/mds/Server.cc
Expand Up @@ -299,6 +299,7 @@ void Server::dispatch(const cref_t<Message> &m)
return;
}
bool queue_replay = false;
dout(5) << "dispatch request in up:reconnect: " << *req << dendl;
if (req->is_replay() || req->is_async()) {
dout(3) << "queuing replayed op" << dendl;
queue_replay = true;
Expand All @@ -317,10 +318,13 @@ void Server::dispatch(const cref_t<Message> &m)
// process completed request in clientreplay stage. The completed request
// might have created new file/directorie. This guarantees MDS sends a reply
// to client before other request modifies the new file/directorie.
if (session->have_completed_request(req->get_reqid().tid, NULL)) {
dout(3) << "queuing completed op" << dendl;
bool r = session->have_completed_request(req->get_reqid().tid, NULL);
if (r) {
dout(3) << __func__ << ": queuing completed op" << dendl;
queue_replay = true;
}
} else {
dout(20) << __func__ << ": request not complete" << dendl;
}
// this request was created before the cap reconnect message, drop any embedded
// cap releases.
req->releases.clear();
Expand Down Expand Up @@ -1975,13 +1979,16 @@ void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEv

mdr->committing = true;
submit_mdlog_entry(le, fin, mdr, __func__);

if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
if (mds->queue_one_replay()) {
dout(10) << " queued next replay op" << dendl;
} else {
dout(10) << " journaled last replay op" << dendl;
}

if (mdr->is_queued_for_replay()) {

/* We want to queue the next replay op while waiting for the journaling, so
* do it now when the early (unsafe) replay is dispatched. Don't wait until
* this request is cleaned up in MDCache.cc.
*/

mdr->set_queued_next_replay_op();
mds->queue_one_replay();
} else if (mdr->did_early_reply)
mds->locker->drop_rdlocks_for_early_reply(mdr.get());
else
Expand Down Expand Up @@ -2282,15 +2289,12 @@ void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &
mds->send_message_client(reply, session);
}

if (req->is_queued_for_replay() &&
(mdr->has_completed || reply->get_result() < 0)) {
if (reply->get_result() < 0) {
int r = reply->get_result();
if (req->is_queued_for_replay()) {
if (int r = reply->get_result(); r < 0) {
derr << "reply_client_request: failed to replay " << *req
<< " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
<< " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
}
mds->queue_one_replay();
}

// clean up request
Expand Down Expand Up @@ -2488,8 +2492,12 @@ void Server::handle_client_request(const cref_t<MClientRequest> &req)

// register + dispatch
MDRequestRef mdr = mdcache->request_start(req);
if (!mdr.get())
if (!mdr.get()) {
dout(5) << __func__ << ": possibly duplicate op " << *req << dendl;
if (req->is_queued_for_replay())
mds->queue_one_replay();
return;
}

if (session) {
mdr->session = session;
Expand Down

0 comments on commit 7243b68

Please sign in to comment.