108108#include "replication/logicallauncher.h"
109109#include "replication/slotsync.h"
110110#include "replication/walsender.h"
111+ #include "storage/aio_subsys.h"
111112#include "storage/fd.h"
113+ #include "storage/io_worker.h"
112114#include "storage/ipc.h"
113115#include "storage/pmsignal.h"
116+ #include "storage/proc.h"
114117#include "tcop/backend_startup.h"
115118#include "tcop/tcopprot.h"
116119#include "utils/datetime.h"
@@ -340,6 +343,7 @@ typedef enum
340343 * ckpt */
341344 PM_WAIT_XLOG_ARCHIVAL , /* waiting for archiver and walsenders to
342345 * finish */
346+ PM_WAIT_IO_WORKERS , /* waiting for io workers to exit */
343347 PM_WAIT_CHECKPOINTER , /* waiting for checkpointer to shut down */
344348 PM_WAIT_DEAD_END , /* waiting for dead-end children to exit */
345349 PM_NO_CHILDREN , /* all important children have exited */
@@ -402,6 +406,10 @@ bool LoadedSSL = false;
402406static DNSServiceRef bonjour_sdref = NULL ;
403407#endif
404408
409+ /* State for IO worker management. */
410+ static int io_worker_count = 0 ;
411+ static PMChild * io_worker_children [MAX_IO_WORKERS ];
412+
405413/*
406414 * postmaster.c - function prototypes
407415 */
@@ -436,6 +444,8 @@ static void TerminateChildren(int signal);
436444static int CountChildren (BackendTypeMask targetMask );
437445static void LaunchMissingBackgroundProcesses (void );
438446static void maybe_start_bgworkers (void );
447+ static bool maybe_reap_io_worker (int pid );
448+ static void maybe_adjust_io_workers (void );
439449static bool CreateOptsFile (int argc , char * argv [], char * fullprogname );
440450static PMChild * StartChildProcess (BackendType type );
441451static void StartSysLogger (void );
@@ -1365,6 +1375,11 @@ PostmasterMain(int argc, char *argv[])
13651375 */
13661376 AddToDataDirLockFile (LOCK_FILE_LINE_PM_STATUS , PM_STATUS_STARTING );
13671377
1378+ UpdatePMState (PM_STARTUP );
1379+
1380+ /* Make sure we can perform I/O while starting up. */
1381+ maybe_adjust_io_workers ();
1382+
13681383 /* Start bgwriter and checkpointer so they can help with recovery */
13691384 if (CheckpointerPMChild == NULL )
13701385 CheckpointerPMChild = StartChildProcess (B_CHECKPOINTER );
@@ -1377,7 +1392,6 @@ PostmasterMain(int argc, char *argv[])
13771392 StartupPMChild = StartChildProcess (B_STARTUP );
13781393 Assert (StartupPMChild != NULL );
13791394 StartupStatus = STARTUP_RUNNING ;
1380- UpdatePMState (PM_STARTUP );
13811395
13821396 /* Some workers may be scheduled to start now */
13831397 maybe_start_bgworkers ();
@@ -2502,6 +2516,16 @@ process_pm_child_exit(void)
25022516 continue ;
25032517 }
25042518
2519+ /* Was it an IO worker? */
2520+ if (maybe_reap_io_worker (pid ))
2521+ {
2522+ if (!EXIT_STATUS_0 (exitstatus ) && !EXIT_STATUS_1 (exitstatus ))
2523+ HandleChildCrash (pid , exitstatus , _ ("io worker" ));
2524+
2525+ maybe_adjust_io_workers ();
2526+ continue ;
2527+ }
2528+
25052529 /*
25062530 * Was it a backend or a background worker?
25072531 */
@@ -2723,6 +2747,7 @@ HandleFatalError(QuitSignalReason reason, bool consider_sigabrt)
27232747 case PM_WAIT_XLOG_SHUTDOWN :
27242748 case PM_WAIT_XLOG_ARCHIVAL :
27252749 case PM_WAIT_CHECKPOINTER :
2750+ case PM_WAIT_IO_WORKERS :
27262751
27272752 /*
27282753 * NB: Similar code exists in PostmasterStateMachine()'s handling
@@ -2905,20 +2930,21 @@ PostmasterStateMachine(void)
29052930
29062931 /*
29072932 * If we are doing crash recovery or an immediate shutdown then we
2908- * expect archiver, checkpointer and walsender to exit as well,
2909- * otherwise not.
2933+ * expect archiver, checkpointer, io workers and walsender to exit as
2934+ * well, otherwise not.
29102935 */
29112936 if (FatalError || Shutdown >= ImmediateShutdown )
29122937 targetMask = btmask_add (targetMask ,
29132938 B_CHECKPOINTER ,
29142939 B_ARCHIVER ,
2940+ B_IO_WORKER ,
29152941 B_WAL_SENDER );
29162942
29172943 /*
2918- * Normally walsenders and archiver will continue running; they will
2919- * be terminated later after writing the checkpoint record. We also
2920- * let dead-end children to keep running for now. The syslogger
2921- * process exits last.
2944+ * Normally archiver, checkpointer, IO workers and walsenders will
2945+ * continue running; they will be terminated later after writing the
2946+ * checkpoint record. We also let dead-end children keep running
2947+ * for now. The syslogger process exits last.
29222948 *
29232949 * This assertion checks that we have covered all backend types,
29242950 * either by including them in targetMask, or by noting here that they
@@ -2933,12 +2959,13 @@ PostmasterStateMachine(void)
29332959 B_LOGGER );
29342960
29352961 /*
2936- * Archiver, checkpointer and walsender may or may not be in
2937- * targetMask already.
2962+ * Archiver, checkpointer, IO workers, and walsender may or may
2963+ * not be in targetMask already.
29382964 */
29392965 remainMask = btmask_add (remainMask ,
29402966 B_ARCHIVER ,
29412967 B_CHECKPOINTER ,
2968+ B_IO_WORKER ,
29422969 B_WAL_SENDER );
29432970
29442971 /* these are not real postmaster children */
@@ -3039,11 +3066,25 @@ PostmasterStateMachine(void)
30393066 {
30403067 /*
30413068 * PM_WAIT_XLOG_ARCHIVAL state ends when there are no children other
3042- * than checkpointer, dead-end children and logger left. There
3069+ * than checkpointer, io workers, logger and dead-end children left. There
30433070 * shouldn't be any regular backends left by now anyway; what we're
30443071 * really waiting for is for walsenders and archiver to exit.
30453072 */
3046- if (CountChildren (btmask_all_except (B_CHECKPOINTER , B_LOGGER , B_DEAD_END_BACKEND )) == 0 )
3073+ if (CountChildren (btmask_all_except (B_CHECKPOINTER , B_IO_WORKER ,
3074+ B_LOGGER , B_DEAD_END_BACKEND )) == 0 )
3075+ {
3076+ UpdatePMState (PM_WAIT_IO_WORKERS );
3077+ SignalChildren (SIGUSR2 , btmask (B_IO_WORKER ));
3078+ }
3079+ }
3080+
3081+ if (pmState == PM_WAIT_IO_WORKERS )
3082+ {
3083+ /*
3084+ * PM_WAIT_IO_WORKERS state ends when there's only checkpointer and
3085+ * dead_end children left.
3086+ */
3087+ if (io_worker_count == 0 )
30473088 {
30483089 UpdatePMState (PM_WAIT_CHECKPOINTER );
30493090
@@ -3171,10 +3212,14 @@ PostmasterStateMachine(void)
31713212 /* re-create shared memory and semaphores */
31723213 CreateSharedMemoryAndSemaphores ();
31733214
3215+ UpdatePMState (PM_STARTUP );
3216+
3217+ /* Make sure we can perform I/O while starting up. */
3218+ maybe_adjust_io_workers ();
3219+
31743220 StartupPMChild = StartChildProcess (B_STARTUP );
31753221 Assert (StartupPMChild != NULL );
31763222 StartupStatus = STARTUP_RUNNING ;
3177- UpdatePMState (PM_STARTUP );
31783223 /* crash recovery started, reset SIGKILL flag */
31793224 AbortStartTime = 0 ;
31803225
@@ -3198,6 +3243,7 @@ pmstate_name(PMState state)
31983243 PM_TOSTR_CASE (PM_WAIT_BACKENDS );
31993244 PM_TOSTR_CASE (PM_WAIT_XLOG_SHUTDOWN );
32003245 PM_TOSTR_CASE (PM_WAIT_XLOG_ARCHIVAL );
3246+ PM_TOSTR_CASE (PM_WAIT_IO_WORKERS );
32013247 PM_TOSTR_CASE (PM_WAIT_DEAD_END );
32023248 PM_TOSTR_CASE (PM_WAIT_CHECKPOINTER );
32033249 PM_TOSTR_CASE (PM_NO_CHILDREN );
@@ -3235,6 +3281,16 @@ LaunchMissingBackgroundProcesses(void)
32353281 if (SysLoggerPMChild == NULL && Logging_collector )
32363282 StartSysLogger ();
32373283
3284+ /*
3285+ * The number of configured workers might have changed, or a prior start
3286+ * of a worker might have failed. Check if we need to start/stop any
3287+ * workers.
3288+ *
3289+ * A config file change will always lead to this function being called, so
3290+ * we always will process the config change in a timely manner.
3291+ */
3292+ maybe_adjust_io_workers ();
3293+
32383294 /*
32393295 * The checkpointer and the background writer are active from the start,
32403296 * until shutdown is initiated.
@@ -4120,6 +4176,7 @@ bgworker_should_start_now(BgWorkerStartTime start_time)
41204176 case PM_WAIT_DEAD_END :
41214177 case PM_WAIT_XLOG_ARCHIVAL :
41224178 case PM_WAIT_XLOG_SHUTDOWN :
4179+ case PM_WAIT_IO_WORKERS :
41234180 case PM_WAIT_BACKENDS :
41244181 case PM_STOP_BACKENDS :
41254182 break ;
@@ -4270,6 +4327,99 @@ maybe_start_bgworkers(void)
42704327 }
42714328}
42724329
4330+ static bool
4331+ maybe_reap_io_worker (int pid )
4332+ {
4333+ for (int id = 0 ; id < MAX_IO_WORKERS ; ++ id )
4334+ {
4335+ if (io_worker_children [id ] &&
4336+ io_worker_children [id ]-> pid == pid )
4337+ {
4338+ ReleasePostmasterChildSlot (io_worker_children [id ]);
4339+
4340+ -- io_worker_count ;
4341+ io_worker_children [id ] = NULL ;
4342+ return true;
4343+ }
4344+ }
4345+ return false;
4346+ }
4347+
4348+ /*
4349+ * Start or stop IO workers, to close the gap between the number of running
4350+ * workers and the number of configured workers. Used to respond to change of
4351+ * the io_workers GUC (by increasing and decreasing the number of workers), as
4352+ * well as workers terminating in response to errors (by starting
4353+ * "replacement" workers).
4354+ */
4355+ static void
4356+ maybe_adjust_io_workers (void )
4357+ {
4358+ if (!pgaio_workers_enabled ())
4359+ return ;
4360+
4361+ /*
4362+ * If we're in final shutting down state, then we're just waiting for all
4363+ * processes to exit.
4364+ */
4365+ if (pmState >= PM_WAIT_IO_WORKERS )
4366+ return ;
4367+
4368+ /* Don't start new workers during an immediate shutdown either. */
4369+ if (Shutdown >= ImmediateShutdown )
4370+ return ;
4371+
4372+ /*
4373+ * Don't start new workers if we're in the shutdown phase of a crash
4374+ * restart. But we *do* need to start if we're already starting up again.
4375+ */
4376+ if (FatalError && pmState >= PM_STOP_BACKENDS )
4377+ return ;
4378+
4379+ Assert (pmState < PM_WAIT_IO_WORKERS );
4380+
4381+ /* Not enough running? */
4382+ while (io_worker_count < io_workers )
4383+ {
4384+ PMChild * child ;
4385+ int id ;
4386+
4387+ /* find unused entry in io_worker_children array */
4388+ for (id = 0 ; id < MAX_IO_WORKERS ; ++ id )
4389+ {
4390+ if (io_worker_children [id ] == NULL )
4391+ break ;
4392+ }
4393+ if (id == MAX_IO_WORKERS )
4394+ elog (ERROR , "could not find a free IO worker ID" );
4395+
4396+ /* Try to launch one. */
4397+ child = StartChildProcess (B_IO_WORKER );
4398+ if (child != NULL )
4399+ {
4400+ io_worker_children [id ] = child ;
4401+ ++ io_worker_count ;
4402+ }
4403+ else
4404+ break ; /* XXX try again soon? */
4405+ }
4406+
4407+ /* Too many running? */
4408+ if (io_worker_count > io_workers )
4409+ {
4410+ /* ask the IO worker in the highest slot to exit */
4411+ for (int id = MAX_IO_WORKERS - 1 ; id >= 0 ; -- id )
4412+ {
4413+ if (io_worker_children [id ] != NULL )
4414+ {
4415+ kill (io_worker_children [id ]-> pid , SIGUSR2 );
4416+ break ;
4417+ }
4418+ }
4419+ }
4420+ }
4421+
4422+
42734423/*
42744424 * When a backend asks to be notified about worker state changes, we
42754425 * set a flag in its backend entry. The background worker machinery needs
0 commit comments