Skip to content

Commit

Permalink
BHYVE: OS-7300 bhyve should not sync IO unless necessary
Browse files Browse the repository at this point in the history
OS-7416 bhyve BLOCKIF_NUMTHR should keep pace with MAXREQ
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Mike Gerdts <mike.gerdts@joyent.com>
Approved by: Mike Gerdts <mike.gerdts@joyent.com>
  • Loading branch information
pfmooney authored and hadfl committed Dec 7, 2018
1 parent c44ed9b commit 6f5ad7e
Show file tree
Hide file tree
Showing 5 changed files with 179 additions and 58 deletions.
152 changes: 129 additions & 23 deletions usr/src/cmd/bhyve/block_if.c
Expand Up @@ -46,6 +46,9 @@ __FBSDID("$FreeBSD$");
#include <sys/disk.h>
#include <sys/limits.h>
#include <sys/uio.h>
#ifndef __FreeBSD__
#include <sys/dkio.h>
#endif

#include <assert.h>
#include <err.h>
Expand All @@ -69,11 +72,12 @@ __FBSDID("$FreeBSD$");

#define BLOCKIF_SIG 0xb109b109

#define BLOCKIF_NUMTHR 8
#ifdef __FreeBSD__
#define BLOCKIF_NUMTHR 8
#define BLOCKIF_MAXREQ (64 + BLOCKIF_NUMTHR)
#else
/* Enlarge to keep pace with the virtio-block ring size */
#define BLOCKIF_NUMTHR 16
#define BLOCKIF_MAXREQ (128 + BLOCKIF_NUMTHR)
#endif

Expand Down Expand Up @@ -104,12 +108,23 @@ struct blockif_elem {
off_t be_block;
};

#ifndef __FreeBSD__
enum blockif_wce {
WCE_NONE = 0,
WCE_IOCTL,
WCE_FCNTL
};
#endif

struct blockif_ctxt {
int bc_magic;
int bc_fd;
int bc_ischr;
int bc_isgeom;
int bc_candelete;
#ifndef __FreeBSD__
enum blockif_wce bc_wce;
#endif
int bc_rdonly;
off_t bc_size;
int bc_sectsz;
Expand Down Expand Up @@ -230,8 +245,6 @@ blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
struct blockif_req *br;
#ifdef __FreeBSD__
off_t arg[2];
#else
boolean_t sync = B_FALSE;
#endif
ssize_t clen, len, off, boff, voff;
int i, err;
Expand Down Expand Up @@ -277,11 +290,6 @@ blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
br->br_resid -= len;
}
break;
#ifndef __FreeBSD__
case BOP_WRITE_SYNC:
sync = B_TRUE;
#endif
/* FALLTHROUGH */
case BOP_WRITE:
if (bc->bc_rdonly) {
err = EROFS;
Expand Down Expand Up @@ -330,6 +338,12 @@ blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
} else if (fsync(bc->bc_fd))
err = errno;
#else
/*
* This fsync() should be adequate to flush the cache of a file
* or device. In VFS, the VOP_SYNC operation is converted to
* the appropriate ioctl in both sdev (for real devices) and
* zfs (for zvols).
*/
if (fsync(bc->bc_fd))
err = errno;
#endif
Expand Down Expand Up @@ -357,11 +371,6 @@ blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
break;
}

#ifndef __FreeBSD__
if (sync && err == 0 && fsync(bc->bc_fd) < 0)
err = errno;
#endif

be->be_status = BST_DONE;

(*br->br_callback)(br, err);
Expand Down Expand Up @@ -456,6 +465,8 @@ blockif_open(const char *optstr, const char *ident)
struct stat sbuf;
#ifdef __FreeBSD__
struct diocgattr_arg arg;
#else
enum blockif_wce wce = WCE_NONE;
#endif
off_t size, psectsz, psectoff;
int extra, fd, i, sectsz;
Expand Down Expand Up @@ -555,9 +566,66 @@ blockif_open(const char *optstr, const char *ident)
candelete = arg.value.i;
if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
geom = 1;
} else
#endif
} else {
psectsz = sbuf.st_blksize;
}
#else
psectsz = sbuf.st_blksize;
if (S_ISCHR(sbuf.st_mode)) {
struct dk_minfo_ext dkmext;
int wce_val;

/* Look for a more accurate physical blocksize */
if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) {
psectsz = dkmext.dki_pbsize;
}
/* See if a configurable write cache is present and working */
if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) {
/*
* If WCE is already active, disable it until the
* specific device driver calls for its return. If it
* is not active, toggle it on and off to verify that
* such actions are possible.
*/
if (wce_val != 0) {
wce_val = 0;
/*
* Inability to disable the cache is a threat
* to data durability.
*/
assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0);
wce = WCE_IOCTL;
} else {
int r1, r2;

wce_val = 1;
r1 = ioctl(fd, DKIOCSETWCE, &wce_val);
wce_val = 0;
r2 = ioctl(fd, DKIOCSETWCE, &wce_val);

if (r1 == 0 && r2 == 0) {
wce = WCE_IOCTL;
} else {
/*
* If the cache cache toggle was not
* successful, ensure that the cache
* was not left enabled.
*/
assert(r1 != 0);
}
}
}
} else {
int flags;

if ((flags = fcntl(fd, F_GETFL)) >= 0) {
flags |= O_DSYNC;
if (fcntl(fd, F_SETFL, flags) != -1) {
wce = WCE_FCNTL;
}
}
}
#endif

#ifndef WITHOUT_CAPSICUM
if (cap_ioctls_limit(fd, cmds, nitems(cmds)) == -1 && errno != ENOSYS)
Expand Down Expand Up @@ -604,6 +672,9 @@ blockif_open(const char *optstr, const char *ident)
bc->bc_ischr = S_ISCHR(sbuf.st_mode);
bc->bc_isgeom = geom;
bc->bc_candelete = candelete;
#ifndef __FreeBSD__
bc->bc_wce = wce;
#endif
bc->bc_rdonly = ro;
bc->bc_size = size;
bc->bc_sectsz = sectsz;
Expand Down Expand Up @@ -671,19 +742,11 @@ blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
}

int
#ifdef __FreeBSD__
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
#else
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq, boolean_t sync)
#endif
{

assert(bc->bc_magic == BLOCKIF_SIG);
#ifdef __FreeBSD__
return (blockif_request(bc, breq, BOP_WRITE));
#else
return (blockif_request(bc, breq, sync ? BOP_WRITE_SYNC : BOP_WRITE));
#endif
}

int
Expand Down Expand Up @@ -908,3 +971,46 @@ blockif_candelete(struct blockif_ctxt *bc)
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_candelete);
}

#ifndef __FreeBSD__
int
blockif_set_wce(struct blockif_ctxt *bc, int wc_enable)
{
int res = 0, flags;
int clean_val = (wc_enable != 0) ? 1 : 0;

(void) pthread_mutex_lock(&bc->bc_mtx);
switch (bc->bc_wce) {
case WCE_IOCTL:
res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val);
break;
case WCE_FCNTL:
if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) {
if (wc_enable == 0) {
flags |= O_DSYNC;
} else {
flags &= ~O_DSYNC;
}
if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) {
res = -1;
}
} else {
res = -1;
}
break;
default:
break;
}

/*
* After a successful disable of the write cache, ensure that any
* lingering data in the cache is synced out.
*/
if (res == 0 && wc_enable == 0) {
res = fsync(bc->bc_fd);
}
(void) pthread_mutex_unlock(&bc->bc_mtx);

return (res);
}
#endif /* __FreeBSD__ */
8 changes: 3 additions & 5 deletions usr/src/cmd/bhyve/block_if.h
Expand Up @@ -71,13 +71,11 @@ void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off);
int blockif_queuesz(struct blockif_ctxt *bc);
int blockif_is_ro(struct blockif_ctxt *bc);
int blockif_candelete(struct blockif_ctxt *bc);
#ifndef __FreeBSD__
int blockif_set_wce(struct blockif_ctxt *bc, int enable);
#endif
int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq);
#ifdef __FreeBSD__
int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq);
#else
int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq,
boolean_t sync);
#endif
int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq);
Expand Down
20 changes: 11 additions & 9 deletions usr/src/cmd/bhyve/pci_ahci.c
Expand Up @@ -738,16 +738,7 @@ ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
if (readop)
err = blockif_read(p->bctx, breq);
else
#ifdef __FreeBSD__
err = blockif_write(p->bctx, breq);
#else
/*
* XXX: We currently don't handle disabling of the write cache.
* Once this is fixed we need to revisit this code to do sync
* writes when the cache is disabled.
*/
err = blockif_write(p->bctx, breq, B_FALSE);
#endif
assert(err == 0);
}

Expand Down Expand Up @@ -2376,6 +2367,17 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
sc->port[p].port = p;
sc->port[p].atapi = atapi;

#ifndef __FreeBSD__
/*
* Attempt to enable the write cache for this device, as the
* guest will issue FLUSH commands when it requires durability.
*
* Failure here is fine, since an always-sync device will not
* have an impact on correctness.
*/
(void) blockif_set_wce(bctxt, 1);
#endif

/*
* Create an identifier for the backing file.
* Use parts of the md5 sum of the filename
Expand Down
15 changes: 0 additions & 15 deletions usr/src/cmd/bhyve/pci_nvme.c
Expand Up @@ -1013,17 +1013,8 @@ pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
err = blockif_read(sc->nvstore.ctx,
&req->io_req);
else
#ifdef __FreeBSD__
err = blockif_write(sc->nvstore.ctx,
&req->io_req);
#else
err = blockif_write(sc->nvstore.ctx,
&req->io_req, B_FALSE);
/*
* XXX: Is a follow-up needed for proper sync
* detection here or later flush behavior?
*/
#endif

/* wait until req completes before cont */
if (err == 0)
Expand Down Expand Up @@ -1368,13 +1359,7 @@ pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
err = blockif_read(sc->nvstore.ctx, &req->io_req);
break;
case NVME_OPC_WRITE:
#ifdef __FreeBSD__
err = blockif_write(sc->nvstore.ctx, &req->io_req);
#else
/* XXX: Should this be sync? */
err = blockif_write(sc->nvstore.ctx, &req->io_req,
B_FALSE);
#endif
break;
default:
WPRINTF(("%s unhandled io command 0x%x\r\n",
Expand Down

0 comments on commit 6f5ad7e

Please sign in to comment.