Skip to content

Commit

Permalink
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Browse files Browse the repository at this point in the history

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (28 commits)
  ceph: update discussion list address in MAINTAINERS
  ceph: some documentations fixes
  ceph: fix use after free on mds __unregister_request
  ceph: avoid loaded term 'OSD' in documention
  ceph: fix possible double-free of mds request reference
  ceph: fix session check on mds reply
  ceph: handle kmalloc() failure
  ceph: propagate mds session allocation failures to caller
  ceph: make write_begin wait propagate ERESTARTSYS
  ceph: fix snap rebuild condition
  ceph: avoid reopening osd connections when address hasn't changed
  ceph: rename r_sent_stamp r_stamp
  ceph: fix connection fault con_work reentrancy problem
  ceph: prevent dup stale messages to console for restarting mds
  ceph: fix pg pool decoding from incremental osdmap update
  ceph: fix mds sync() race with completing requests
  ceph: only release unused caps with mds requests
  ceph: clean up handle_cap_grant, handle_caps wrt session mutex
  ceph: fix session locking in handle_caps, ceph_check_caps
  ceph: drop unnecessary WARN_ON in caps migration
  ...
  • Loading branch information
torvalds committed Mar 29, 2010
2 parents 9d54e2c + 82593f8 commit 9f32160
Show file tree
Hide file tree
Showing 15 changed files with 191 additions and 97 deletions.
2 changes: 2 additions & 0 deletions Documentation/filesystems/00-INDEX
Expand Up @@ -16,6 +16,8 @@ befs.txt
- information about the BeOS filesystem for Linux.
bfs.txt
- info for the SCO UnixWare Boot Filesystem (BFS).
ceph.txt
- info for the Ceph Distributed File System
cifs.txt
- description of the CIFS filesystem.
coda.txt
Expand Down
11 changes: 6 additions & 5 deletions Documentation/filesystems/ceph.txt
Expand Up @@ -8,7 +8,7 @@ Basic features include:

* POSIX semantics
* Seamless scaling from 1 to many thousands of nodes
* High availability and reliability. No single points of failure.
* High availability and reliability. No single point of failure.
* N-way replication of data across storage nodes
* Fast recovery from node failures
* Automatic rebalancing of data on node addition/removal
Expand Down Expand Up @@ -94,7 +94,7 @@ Mount Options

wsize=X
Specify the maximum write size in bytes. By default there is no
maximu. Ceph will normally size writes based on the file stripe
maximum. Ceph will normally size writes based on the file stripe
size.

rsize=X
Expand All @@ -115,7 +115,7 @@ Mount Options
number of entries in that directory.

nocrc
Disable CRC32C calculation for data writes. If set, the OSD
Disable CRC32C calculation for data writes. If set, the storage node
must rely on TCP's error correction to detect data corruption
in the data payload.

Expand All @@ -133,7 +133,8 @@ For more information on Ceph, see the home page at
http://ceph.newdream.net/

The Linux kernel client source tree is available at
git://ceph.newdream.net/linux-ceph-client.git
git://ceph.newdream.net/git/ceph-client.git
git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git

and the source for the full system is at
git://ceph.newdream.net/ceph.git
git://ceph.newdream.net/git/ceph.git
2 changes: 1 addition & 1 deletion MAINTAINERS
Expand Up @@ -1443,7 +1443,7 @@ F: arch/powerpc/platforms/cell/

CEPH DISTRIBUTED FILE SYSTEM CLIENT
M: Sage Weil <sage@newdream.net>
L: ceph-devel@lists.sourceforge.net
L: ceph-devel@vger.kernel.org
W: http://ceph.newdream.net/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
S: Supported
Expand Down
10 changes: 8 additions & 2 deletions fs/ceph/addr.c
Expand Up @@ -919,6 +919,10 @@ static int context_is_writeable_or_written(struct inode *inode,
/*
* We are only allowed to write into/dirty the page if the page is
* clean, or already dirty within the same snap context.
*
* called with page locked.
* return success with page locked,
* or any failure (incl -EAGAIN) with page unlocked.
*/
static int ceph_update_writeable_page(struct file *file,
loff_t pos, unsigned len,
Expand Down Expand Up @@ -961,9 +965,11 @@ static int ceph_update_writeable_page(struct file *file,
snapc = ceph_get_snap_context((void *)page->private);
unlock_page(page);
ceph_queue_writeback(inode);
wait_event_interruptible(ci->i_cap_wq,
r = wait_event_interruptible(ci->i_cap_wq,
context_is_writeable_or_written(inode, snapc));
ceph_put_snap_context(snapc);
if (r == -ERESTARTSYS)
return r;
return -EAGAIN;
}

Expand Down Expand Up @@ -1035,7 +1041,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
int r;

do {
/* get a page*/
/* get a page */
page = grab_cache_page_write_begin(mapping, index, 0);
if (!page)
return -ENOMEM;
Expand Down
53 changes: 38 additions & 15 deletions fs/ceph/auth_x.c
Expand Up @@ -28,6 +28,12 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
return (ac->want_keys & xi->have_keys) == ac->want_keys;
}

/*
 * Buffer length to reserve when encrypting @ilen bytes of input: the
 * input length plus a ceph_x_encrypt_header, a u32, and 16 extra bytes.
 *
 * NOTE(review): the u32 is presumably the encoded length prefix and the
 * 16 bytes presumably allow for cipher block padding -- confirm against
 * ceph_x_encrypt().
 */
static int ceph_x_encrypt_buflen(int ilen)
{
	/* overhead that does not depend on the payload size */
	size_t fixed_overhead = sizeof(struct ceph_x_encrypt_header) +
				sizeof(u32) + 16;

	return fixed_overhead + ilen;
}

static int ceph_x_encrypt(struct ceph_crypto_key *secret,
void *ibuf, int ilen, void *obuf, size_t olen)
{
Expand Down Expand Up @@ -150,6 +156,11 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
struct timespec validity;
struct ceph_crypto_key old_key;
void *tp, *tpend;
struct ceph_timespec new_validity;
struct ceph_crypto_key new_session_key;
struct ceph_buffer *new_ticket_blob;
unsigned long new_expires, new_renew_after;
u64 new_secret_id;

ceph_decode_need(&p, end, sizeof(u32) + 1, bad);

Expand Down Expand Up @@ -182,16 +193,16 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
goto bad;

memcpy(&old_key, &th->session_key, sizeof(old_key));
ret = ceph_crypto_key_decode(&th->session_key, &dp, dend);
ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
if (ret)
goto out;

ceph_decode_copy(&dp, &th->validity, sizeof(th->validity));
ceph_decode_timespec(&validity, &th->validity);
th->expires = get_seconds() + validity.tv_sec;
th->renew_after = th->expires - (validity.tv_sec / 4);
dout(" expires=%lu renew_after=%lu\n", th->expires,
th->renew_after);
ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
ceph_decode_timespec(&validity, &new_validity);
new_expires = get_seconds() + validity.tv_sec;
new_renew_after = new_expires - (validity.tv_sec / 4);
dout(" expires=%lu renew_after=%lu\n", new_expires,
new_renew_after);

/* ticket blob for service */
ceph_decode_8_safe(&p, end, is_enc, bad);
Expand All @@ -216,10 +227,21 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
dout(" ticket blob is %d bytes\n", dlen);
ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
struct_v = ceph_decode_8(&tp);
th->secret_id = ceph_decode_64(&tp);
ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend);
new_secret_id = ceph_decode_64(&tp);
ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
if (ret)
goto out;

/* all is well, update our ticket */
ceph_crypto_key_destroy(&th->session_key);
if (th->ticket_blob)
ceph_buffer_put(th->ticket_blob);
th->session_key = new_session_key;
th->ticket_blob = new_ticket_blob;
th->validity = new_validity;
th->secret_id = new_secret_id;
th->expires = new_expires;
th->renew_after = new_renew_after;
dout(" got ticket service %d (%s) secret_id %lld len %d\n",
type, ceph_entity_type_name(type), th->secret_id,
(int)th->ticket_blob->vec.iov_len);
Expand All @@ -242,7 +264,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
struct ceph_x_ticket_handler *th,
struct ceph_x_authorizer *au)
{
int len;
int maxlen;
struct ceph_x_authorize_a *msg_a;
struct ceph_x_authorize_b msg_b;
void *p, *end;
Expand All @@ -253,15 +275,15 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
dout("build_authorizer for %s %p\n",
ceph_entity_type_name(th->service), au);

len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) +
ticket_blob_len + 16;
dout(" need len %d\n", len);
if (au->buf && au->buf->alloc_len < len) {
maxlen = sizeof(*msg_a) + sizeof(msg_b) +
ceph_x_encrypt_buflen(ticket_blob_len);
dout(" need len %d\n", maxlen);
if (au->buf && au->buf->alloc_len < maxlen) {
ceph_buffer_put(au->buf);
au->buf = NULL;
}
if (!au->buf) {
au->buf = ceph_buffer_new(len, GFP_NOFS);
au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
if (!au->buf)
return -ENOMEM;
}
Expand Down Expand Up @@ -296,6 +318,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
au->buf->vec.iov_len = p - au->buf->vec.iov_base;
dout(" built authorizer nonce %llx len %d\n", au->nonce,
(int)au->buf->vec.iov_len);
BUG_ON(au->buf->vec.iov_len > maxlen);
return 0;

out_buf:
Expand Down
73 changes: 39 additions & 34 deletions fs/ceph/caps.c
Expand Up @@ -1407,14 +1407,14 @@ static int try_nonblocking_invalidate(struct inode *inode)
*/
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session)
__releases(session->s_mutex)
{
struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
struct ceph_mds_client *mdsc = &client->mdsc;
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
int file_wanted, used;
int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
int drop_session_lock = session ? 0 : 1;
int issued, implemented, want, retain, revoking, flushing = 0;
int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */
Expand Down Expand Up @@ -1639,7 +1639,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
if (queue_invalidate)
ceph_queue_invalidate(inode);

if (session && drop_session_lock)
if (session)
mutex_unlock(&session->s_mutex);
if (took_snap_rwsem)
up_read(&mdsc->snap_rwsem);
Expand Down Expand Up @@ -2195,18 +2195,19 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
* Handle a cap GRANT message from the MDS. (Note that a GRANT may
* actually be a revocation if it specifies a smaller cap set.)
*
* caller holds s_mutex.
* caller holds s_mutex and i_lock, we drop both.
*
* return value:
* 0 - ok
* 1 - check_caps on auth cap only (writeback)
* 2 - check_caps (ack revoke)
*/
static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
struct ceph_mds_session *session,
struct ceph_cap *cap,
struct ceph_buffer *xattr_buf)
static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
struct ceph_mds_session *session,
struct ceph_cap *cap,
struct ceph_buffer *xattr_buf)
__releases(inode->i_lock)

__releases(session->s_mutex)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
Expand All @@ -2216,7 +2217,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
u64 size = le64_to_cpu(grant->size);
u64 max_size = le64_to_cpu(grant->max_size);
struct timespec mtime, atime, ctime;
int reply = 0;
int check_caps = 0;
int wake = 0;
int writeback = 0;
int revoked_rdcache = 0;
Expand Down Expand Up @@ -2329,11 +2330,12 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
writeback = 1; /* will delay ack */
else if (dirty & ~newcaps)
reply = 1; /* initiate writeback in check_caps */
check_caps = 1; /* initiate writeback in check_caps */
else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
revoked_rdcache)
reply = 2; /* send revoke ack in check_caps */
check_caps = 2; /* send revoke ack in check_caps */
cap->issued = newcaps;
cap->implemented |= newcaps;
} else if (cap->issued == newcaps) {
dout("caps unchanged: %s -> %s\n",
ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
Expand All @@ -2346,6 +2348,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
* pending revocation */
wake = 1;
}
BUG_ON(cap->issued & ~cap->implemented);

spin_unlock(&inode->i_lock);
if (writeback)
Expand All @@ -2359,7 +2362,14 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ceph_queue_invalidate(inode);
if (wake)
wake_up(&ci->i_cap_wq);
return reply;

if (check_caps == 1)
ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
session);
else if (check_caps == 2)
ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
else
mutex_unlock(&session->s_mutex);
}

/*
Expand Down Expand Up @@ -2548,9 +2558,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
ci->i_cap_exporting_issued = cap->issued;
}
__ceph_remove_cap(cap);
} else {
WARN_ON(!cap);
}
/* else, we already released it */

spin_unlock(&inode->i_lock);
}
Expand Down Expand Up @@ -2621,9 +2630,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
u64 cap_id;
u64 size, max_size;
u64 tid;
int check_caps = 0;
void *snaptrace;
int r;

dout("handle_caps from mds%d\n", mds);

Expand Down Expand Up @@ -2668,8 +2675,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_IMPORT:
handle_cap_import(mdsc, inode, h, session,
snaptrace, le32_to_cpu(h->snap_trace_len));
check_caps = 1; /* we may have sent a RELEASE to the old auth */
goto done;
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
session);
goto done_unlocked;
}

/* the rest require a cap */
Expand All @@ -2686,16 +2694,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
switch (op) {
case CEPH_CAP_OP_REVOKE:
case CEPH_CAP_OP_GRANT:
r = handle_cap_grant(inode, h, session, cap, msg->middle);
if (r == 1)
ceph_check_caps(ceph_inode(inode),
CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
session);
else if (r == 2)
ceph_check_caps(ceph_inode(inode),
CHECK_CAPS_NODELAY,
session);
break;
handle_cap_grant(inode, h, session, cap, msg->middle);
goto done_unlocked;

case CEPH_CAP_OP_FLUSH_ACK:
handle_cap_flush_ack(inode, tid, h, session, cap);
Expand All @@ -2713,9 +2713,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,

done:
mutex_unlock(&session->s_mutex);

if (check_caps)
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
done_unlocked:
if (inode)
iput(inode);
return;
Expand Down Expand Up @@ -2838,11 +2836,18 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
struct ceph_cap *cap;
struct ceph_mds_request_release *rel = *p;
int ret = 0;

dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
mds, ceph_cap_string(drop), ceph_cap_string(unless));
int used = 0;

spin_lock(&inode->i_lock);
used = __ceph_caps_used(ci);

dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
mds, ceph_cap_string(used), ceph_cap_string(drop),
ceph_cap_string(unless));

/* only drop unused caps */
drop &= ~used;

cap = __get_cap_for_mds(ci, mds);
if (cap && __cap_is_valid(cap)) {
if (force ||
Expand Down
4 changes: 3 additions & 1 deletion fs/ceph/dir.c
Expand Up @@ -288,8 +288,10 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;

/* discard old result, if any */
if (fi->last_readdir)
if (fi->last_readdir) {
ceph_mdsc_put_request(fi->last_readdir);
fi->last_readdir = NULL;
}

/* requery frag tree, as the frag topology may have changed */
frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
Expand Down

0 comments on commit 9f32160

Please sign in to comment.