Skip to content

Commit a35eb4e

Browse files
committed
Add vectored I/O support for Windows.
Open vfds in "overlapped" mode, and teach pg_pread()/pg_pwrite() to wait if necessary. Add pg_direct_preadv()/pg_direct_pwritev() as synonyms for pg_preadv()/pg_pwritev() when using a direct I/O file handle, but on Windows redirect them to ReadFileScatter() and WriteFileGather(). Teach fd.c to use those alternative names if a File was opened with PG_O_DIRECT, since it remembers that for vfds. This is enough for io_method=sync and io_method=worker to benefit from true vectored I/O for buffer pool I/O when debug_io_direct=data is used on this platform. Later patches for native AIO need this too. Windows doesn't have vectored APIs for buffered I/O, and still falls back to the loop-based emulation in that case.
1 parent 83336cf commit a35eb4e

File tree

8 files changed

+283
-5
lines changed

8 files changed

+283
-5
lines changed

src/backend/storage/file/fd.c

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1621,6 +1621,17 @@ PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
16211621
*/
16221622
fileFlags |= O_CLOEXEC;
16231623

1624+
#ifdef WIN32
1625+
1626+
/*
1627+
* Enable overlapped for all vfds on Windows. This and PG_O_DIRECT are
1628+
* required for vectored I/O. The pg_p{read,readv,write,writev}()
1629+
* emulation functions wait for completion if required, so they remain
1630+
* fully synchronous.
1631+
*/
1632+
fileFlags |= O_OVERLAPPED;
1633+
#endif
1634+
16241635
vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
16251636

16261637
if (vfdP->fd < 0)
@@ -2183,7 +2194,10 @@ FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset,
21832194

21842195
retry:
21852196
pgstat_report_wait_start(wait_event_info);
2186-
returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2197+
if (vfdP->fileFlags & PG_O_DIRECT)
2198+
returnCode = pg_direct_preadv(vfdP->fd, iov, iovcnt, offset);
2199+
else
2200+
returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
21872201
pgstat_report_wait_end();
21882202

21892203
if (returnCode < 0)
@@ -2293,7 +2307,10 @@ FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset,
22932307

22942308
retry:
22952309
pgstat_report_wait_start(wait_event_info);
2296-
returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2310+
if (vfdP->fileFlags & PG_O_DIRECT)
2311+
returnCode = pg_direct_pwritev(vfdP->fd, iov, iovcnt, offset);
2312+
else
2313+
returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
22972314
pgstat_report_wait_end();
22982315

22992316
if (returnCode >= 0)

src/include/port.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,8 @@ extern bool rmtree(const char *path, bool rmtopdir);
338338
* open() and fopen() replacements to allow deletion of open files and
339339
* passing of other special options.
340340
*/
341-
#define O_DIRECT 0x80000000
341+
#define O_OVERLAPPED 0x40000000
342+
#define O_DIRECT 0x80000000
342343
extern HANDLE pgwin32_open_handle(const char *, int, bool);
343344
extern int pgwin32_open(const char *, int,...);
344345
extern FILE *pgwin32_fopen(const char *, const char *);

src/include/port/pg_iovec.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,18 @@ struct iovec
4040
*/
4141
#define PG_IOV_MAX Min(IOV_MAX, 128)
4242

43+
/* Expose whether vectored I/O is emulated on this platform. */
44+
#if defined(WIN32)
45+
/* Windows only has it for O_DIRECT | O_OVERLAPPED. */
46+
#define pg_iov_emulated(direct) (!(direct))
47+
#elif HAVE_DECL_PREADV && HAVE_DECL_PWRITEV
48+
/* In practice this covers all modern Unixen. */
49+
#define pg_iov_emulated(direct) false
50+
#else
51+
/* A few historical late adopters, not likely in the field. */
52+
#define pg_iov_emulated(direct) true
53+
#endif
54+
4355
/*
4456
* Like preadv(), but with a prefix to remind us of a side-effect: on Windows
4557
* this changes the current file position.
@@ -118,4 +130,30 @@ pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
118130
#endif
119131
}
120132

133+
/*
134+
* Variants for callers that opened with O_DIRECT and (Windows only)
135+
* O_OVERLAPPED. This reaches real vectored I/O on Windows, but fails without
136+
* those flags.
137+
*/
138+
139+
/*
 * Vectored read for a descriptor opened for direct I/O.
 *
 * On Windows, a request carrying more than one iovec is handed to
 * pg_win32_direct_preadv(); single-iovec requests, and every request on
 * other platforms, go through plain pg_preadv().  The return value is
 * whatever the chosen function returns.
 *
 * NOTE(review): the prototype of pg_win32_direct_preadv() in win32_port.h
 * takes a non-const "struct iovec *", so the call below drops a const
 * qualifier -- confirm whether that prototype should be const-qualified.
 */
static inline ssize_t
pg_direct_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
{
#ifdef WIN32
	return (iovcnt > 1)
		? pg_win32_direct_preadv(fd, iov, iovcnt, offset)
		: pg_preadv(fd, iov, iovcnt, offset);
#else
	return pg_preadv(fd, iov, iovcnt, offset);
#endif
}
148+
149+
/*
 * Vectored write for a descriptor opened for direct I/O.
 *
 * On Windows, a request carrying more than one iovec is handed to
 * pg_win32_direct_pwritev(); single-iovec requests, and every request on
 * other platforms, go through plain pg_pwritev().  The return value is
 * whatever the chosen function returns.
 *
 * NOTE(review): the prototype of pg_win32_direct_pwritev() in win32_port.h
 * takes a non-const "struct iovec *", so the call below drops a const
 * qualifier -- confirm whether that prototype should be const-qualified.
 */
static inline ssize_t
pg_direct_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
{
#ifdef WIN32
	return (iovcnt > 1)
		? pg_win32_direct_pwritev(fd, iov, iovcnt, offset)
		: pg_pwritev(fd, iov, iovcnt, offset);
#else
	return pg_pwritev(fd, iov, iovcnt, offset);
#endif
}
158+
121159
#endif /* PG_IOVEC_H */

src/include/port/win32_port.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -585,8 +585,27 @@ typedef unsigned short mode_t;
585585

586586
/* in port/win32pread.c */
587587
extern ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset);
588+
extern ssize_t pg_win32_direct_preadv(int fd, struct iovec *iovec, int iovcnt, off_t offset);
588589

589590
/* in port/win32pwrite.c */
590591
extern ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, off_t offset);
592+
extern ssize_t pg_win32_direct_pwritev(int fd, struct iovec *iovec, int iovcnt, off_t offset);
593+
594+
/*
595+
* This should in theory be GetSystemInfo()'s dwPageSize, but it's
596+
* PG_IO_ALIGN_SIZE on all known modern systems. If it's ever wrong, direct
597+
* I/O operations will fail here and elsewhere, and you'll have to turn that
598+
* mode off, but that applies to every OS.
599+
*/
600+
#define PG_WIN32_FILE_SEGMENT_SIZE PG_IO_ALIGN_SIZE
601+
602+
/* Max segments expected for block-oriented I/O. */
603+
#define PG_WIN32_FILE_SEGMENTS_MAX (PG_IOV_MAX * (BLCKSZ / PG_IO_ALIGN_SIZE))
604+
605+
/* in port/win32common.c */
606+
extern DWORD pg_win32_iovec_to_file_segments(FILE_SEGMENT_ELEMENT * segments,
607+
int maxsegments,
608+
struct iovec *iov,
609+
int iovcnt);
591610

592611
#endif /* PG_WIN32_PORT_H */

src/port/open.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ pgwin32_open_handle(const char *fileName, int fileFlags, bool backup_semantics)
7575
assert((fileFlags & ((O_RDONLY | O_WRONLY | O_RDWR) | O_APPEND |
7676
(O_RANDOM | O_SEQUENTIAL | O_TEMPORARY) |
7777
_O_SHORT_LIVED | O_DSYNC | O_DIRECT |
78+
O_OVERLAPPED |
7879
(O_CREAT | O_TRUNC | O_EXCL) | (O_TEXT | O_BINARY))) == fileFlags);
7980

8081
sa.nLength = sizeof(sa);
@@ -95,6 +96,7 @@ pgwin32_open_handle(const char *fileName, int fileFlags, bool backup_semantics)
9596
((fileFlags & O_SEQUENTIAL) ? FILE_FLAG_SEQUENTIAL_SCAN : 0) |
9697
((fileFlags & _O_SHORT_LIVED) ? FILE_ATTRIBUTE_TEMPORARY : 0) |
9798
((fileFlags & O_TEMPORARY) ? FILE_FLAG_DELETE_ON_CLOSE : 0) |
99+
((fileFlags & O_OVERLAPPED) ? FILE_FLAG_OVERLAPPED : 0) |
98100
((fileFlags & O_DIRECT) ? FILE_FLAG_NO_BUFFERING : 0) |
99101
((fileFlags & O_DSYNC) ? FILE_FLAG_WRITE_THROUGH : 0),
100102
NULL)) == INVALID_HANDLE_VALUE)

src/port/win32common.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#include "postgres.h"
2020
#endif
2121

22+
#include "port/pg_iovec.h"
23+
2224
/*
2325
* pgwin32_get_file_type
2426
*
@@ -62,3 +64,39 @@ pgwin32_get_file_type(HANDLE hFile)
6264

6365
return fileType;
6466
}
67+
68+
/*
69+
* Windows scatter/gather works with lists of raw page addresses, which this
70+
* function produces from Unix iovec format. All iovecs must be aligned to
71+
* PG_WIN32_FILE_SEGMENT_SIZE (but they always are in PostgreSQL when using
72+
* direct I/O, and vectored I/O is only available on Windows with direct I/O).
73+
* Returns zero on badly aligned or input that would exceed maxsegments.
74+
*/
75+
DWORD
76+
pg_win32_iovec_to_file_segments(FILE_SEGMENT_ELEMENT * segments,
77+
int maxsegments,
78+
struct iovec *iov,
79+
int iovcnt)
80+
{
81+
DWORD nsegments = 0;
82+
83+
for (int i = 0; i < iovcnt; ++i)
84+
{
85+
char *base = iov[i].iov_base;
86+
size_t len = iov[i].iov_len;
87+
88+
if (nsegments > maxsegments ||
89+
((intptr_t) base) % PG_WIN32_FILE_SEGMENT_SIZE != 0 ||
90+
len % PG_WIN32_FILE_SEGMENT_SIZE != 0)
91+
return 0;
92+
93+
while (len > 0)
94+
{
95+
segments[nsegments++].Buffer = base;
96+
base += PG_WIN32_FILE_SEGMENT_SIZE;
97+
len -= PG_WIN32_FILE_SEGMENT_SIZE;
98+
}
99+
}
100+
101+
return nsegments * PG_WIN32_FILE_SEGMENT_SIZE;
102+
}

src/port/win32pread.c

Lines changed: 86 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,22 @@
1313

1414

1515
#include "c.h"
16+
#include "port/pg_iovec.h"
1617

1718
#include <windows.h>
1819

20+
#ifndef FRONTEND
21+
static HANDLE
22+
completion_event(void)
23+
{
24+
static HANDLE h = NULL;
25+
26+
if (!h && !(h = CreateEvent(NULL, true, false, NULL)))
27+
errno = ENOMEM;
28+
return h;
29+
}
30+
#endif
31+
1932
ssize_t
2033
pg_pread(int fd, void *buf, size_t size, off_t offset)
2134
{
@@ -33,10 +46,81 @@ pg_pread(int fd, void *buf, size_t size, off_t offset)
3346
/* Avoid overflowing DWORD. */
3447
size = Min(size, 1024 * 1024 * 1024);
3548

36-
/* Note that this changes the file position, despite not using it. */
49+
/*
50+
* Note that, unless the file was opened with O_OVERLAPPED, this changes
51+
* the file position despite not using it.
52+
*/
3753
overlapped.Offset = offset;
54+
55+
#ifndef FRONTEND
56+
/* If not synchronously handled from cache, we may need to wait. */
57+
if (unlikely(!(overlapped.hEvent = completion_event())))
58+
return -1;
59+
#endif
60+
3861
if (!ReadFile(handle, buf, size, &result, &overlapped))
3962
{
63+
#ifndef FRONTEND
64+
if (GetLastError() == ERROR_IO_PENDING &&
65+
GetOverlappedResult(overlapped.hEvent,
66+
&overlapped,
67+
&result,
68+
TRUE))
69+
return result;
70+
#endif
71+
72+
if (GetLastError() == ERROR_HANDLE_EOF)
73+
return 0;
74+
75+
_dosmaperr(GetLastError());
76+
return -1;
77+
}
78+
79+
return result;
80+
}
81+
82+
#ifndef FRONTEND
83+
/*
84+
* Special emulation of preadv() that works with O_DIRECT | O_OVERLAPPED.
85+
* See pg_iovec.h for general emulation.
86+
*/
87+
ssize_t
88+
pg_win32_direct_preadv(int fd, struct iovec *iov, int iovcnt, off_t offset)
89+
{
90+
FILE_SEGMENT_ELEMENT segments[PG_WIN32_FILE_SEGMENTS_MAX];
91+
OVERLAPPED overlapped = {0};
92+
HANDLE handle;
93+
DWORD size;
94+
DWORD result;
95+
96+
handle = (HANDLE) _get_osfhandle(fd);
97+
if (handle == INVALID_HANDLE_VALUE)
98+
{
99+
errno = EBADF;
100+
return -1;
101+
}
102+
103+
size = pg_win32_iovec_to_file_segments(segments, lengthof(segments),
104+
iov, iovcnt);
105+
if (size == 0)
106+
{
107+
errno = EINVAL;
108+
return -1;
109+
}
110+
111+
/* If not synchronously handled from cache, we may need to wait. */
112+
if (unlikely(!(overlapped.hEvent = completion_event())))
113+
return -1;
114+
115+
if (!ReadFileScatter(handle, segments, size, &result, &overlapped))
116+
{
117+
if (GetLastError() == ERROR_IO_PENDING &&
118+
GetOverlappedResult(overlapped.hEvent,
119+
&overlapped,
120+
&result,
121+
TRUE))
122+
return result;
123+
40124
if (GetLastError() == ERROR_HANDLE_EOF)
41125
return 0;
42126

@@ -46,3 +130,4 @@ pg_pread(int fd, void *buf, size_t size, off_t offset)
46130

47131
return result;
48132
}
133+
#endif

0 commit comments

Comments
 (0)