-
Notifications
You must be signed in to change notification settings - Fork 3.7k
/
log0write.cc
2861 lines (2122 loc) · 98.7 KB
/
log0write.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*****************************************************************************
Copyright (c) 1995, 2023, Oracle and/or its affiliates.
Copyright (c) 2009, Google Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2.0,
as published by the Free Software Foundation.
This program is also distributed with certain software (including
but not limited to OpenSSL) that is licensed under separate terms,
as designated in a particular file or component or in included license
documentation. The authors of MySQL hereby grant you an additional
permission to link the program and your derivative works with the
separately licensed software that they have included with MySQL.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License, version 2.0, for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*****************************************************************************/
/**************************************************/ /**
@file log/log0write.cc
Redo log writing and flushing, including functions for:
1. Waiting for the log written / flushed up to provided lsn.
2. Redo log write threads: log_writer, log_flusher,
log_write_notifier, log_flush_notifier.
@author Paweł Olchawa
*******************************************************/
#ifndef UNIV_HOTBACKUP
/* std::memory_order_* */
#include <atomic>
/* thd_wait_begin() / thd_wait_end() */
#include <mysql/service_thd_wait.h>
/* std::memcpy, std::memset */
#include <cstring>
/* arch_log_sys */
#include "arch0arch.h"
/* page_id_t */
#include "buf0types.h"
/* log_update_buf_limit, ... */
#include "log0buf.h"
/* log_request_checkpoint */
#include "log0chkp.h"
/* Log_files_capacity::soft_logical_capacity, ... */
#include "log0files_capacity.h"
/* log_files_produce_file, ... */
#include "log0files_governor.h"
/* log_limits_mutex_enter, ... */
#include "log0log.h"
/* redo_log_archive_produce */
#include "log0meb.h"
/* recv_no_ibuf_operations */
#include "log0recv.h"
/* log_t::X */
#include "log0sys.h"
/* log_sync_point */
#include "log0test.h"
/* Log_file::offset, OS_FILE_LOG_BLOCK_SIZE */
#include "log0types.h"
/* log_writer_mutex */
#include "log0write.h"
/* create_internal_thd, destroy_internal_thd */
#include "sql/sql_thd_internal_api.h"
/* MONITOR_INC, ... */
#include "srv0mon.h"
/* srv_read_only_mode */
#include "srv0srv.h"
/* ut_uint64_align_down */
#include "ut0byte.h"
// clang-format off
/**************************************************/ /**
@page PAGE_INNODB_REDO_LOG_THREADS Background redo log threads
Four background log threads are responsible for writes of new data to disk:
-# [Log writer](@ref sect_redo_log_writer) - writes from the log buffer or
write-ahead buffer to OS buffers.
-# [Log flusher](@ref sect_redo_log_flusher) - writes from OS buffers to disk
(fsyncs).
-# [Log write_notifier](@ref sect_redo_log_write_notifier) - notifies user
threads about completed writes to disk (when write_lsn is advanced).
-# [Log flush_notifier](@ref sect_redo_log_flush_notifier) - notifies user
threads about completed fsyncs (when flushed_to_disk_lsn is advanced).
One background log thread is responsible for checkpoints (reclaiming space
in log files):
-# [Log checkpointer](@ref sect_redo_log_checkpointer) - determines
@ref subsect_redo_log_available_for_checkpoint_lsn and writes checkpoints.
@section sect_redo_log_writer Thread: log writer
This thread is responsible for writing data from the log buffer to disk
(to the log files). However, it's not responsible for doing fsync() calls.
It copies data to system buffers. It is the log flusher thread, which is
responsible for doing fsync().
There are following points that need to be addressed by the log writer thread:
-# %Find out how much data is ready in the log buffer, which is concurrently
filled in by multiple user threads.
In the log recent written buffer, user threads set links for every finished
write to the log buffer. Each such link is represented as a number of bytes
written, starting from a _start_lsn_. The link is stored in the slot assigned
to the _start_lsn_ of the write.
The log writer thread tracks links in the recent written buffer, traversing
a connected path created by the links. It stops when it encounters a missing
outgoing link. In such case the next fragment of the log buffer is still
being written (or the maximum assigned lsn was reached).
It also stops as soon as it has traversed by more than 4kB, in which case
it is enough for a next write (unless we decided again to do fsyncs from
inside the log writer thread). After traversing links and clearing slots
occupied by the links (in the recent written buffer), the log writer thread
updates @ref subsect_redo_log_buf_ready_for_write_lsn.
@diafile storage/innobase/log/recent_written_buffer.dia "Example of links in the recent written buffer"
@note The log buffer has no holes up to the _log.buf_ready_for_write_lsn_
(all concurrent writes for smaller lsn have been finished).
If there were no links to traverse, _log.buf_ready_for_write_lsn_ was not
advanced and the log writer thread needs to wait. In such case it first
uses spin delay and afterwards switches to wait on the _writer_event_.
-# Prepare log blocks for writing - update their headers and footers.
The log writer thread detects completed log blocks in the log buffer.
Such log blocks will not receive any more writes. Hence their headers
and footers could be easily updated (e.g. checksum is calculated).
@diafile storage/innobase/log/log_writer_complete_blocks.dia "Complete blocks are detected and written"
If any complete blocks were detected, they are written directly from
the log buffer (after updating headers and footers). Afterwards the
log writer thread retries the previous step before making next decisions.
For each write consisting of one or more complete blocks, the
_MONITOR_LOG_FULL_BLOCK_WRITES_ is incremented by one.
@note There is a special case - when write-ahead is required, data needs
to be copied to the write-ahead buffer and the last incomplete block could
also be copied and written. For details read below and check the next point.
The special case is also for the last, incomplete log block. Note that
@ref subsect_redo_log_buf_ready_for_write_lsn could be in the middle of
such block. In such case, next writes are likely incoming to the log block.
@diafile storage/innobase/log/log_writer_incomplete_block.dia "Incomplete block is copied"
For performance reasons we often need to write the last incomplete block.
That's because it turned out, that we should try to reclaim user threads
as soon as possible, allowing them to handle next transactions and provide
next data.
In such case:
- the last log block is first copied to the dedicated buffer, up to the
@ref subsect_redo_log_buf_ready_for_write_lsn,
- the remaining part of the block in the dedicated buffer is filled in
with _0x00_ bytes,
- header fields are updated,
- checksum is calculated and stored in the block's footer,
- the block is written from the dedicated buffer,
- the _MONITOR_LOG_PARTIAL_BLOCK_WRITES_ is incremented by one.
@note The write-ahead buffer is used as the dedicated buffer for writes
of the last incomplete block. That's because, whenever we needed a next
write-ahead (even for complete blocks), we possibly can also write the
last incomplete block during the write-ahead. The monitor counters for
full/partial block writes are incremented before the logic related to
writing ahead is applied. Hence the counter of partial block writes is
not incremented if a full block write was possible (in which case only
requirement for write-ahead could be the reason of writing the incomplete
block).
@remarks The log writer thread never updates
[first_rec_group](@ref a_redo_log_block_first_rec_group) field.
It has to be set by user threads before the block is allowed to be written.
That's because only user threads know where are the boundaries between
groups of log records. The user thread which has written data ending at
lsn which needs to be pointed as _first_rec_group_, is the one responsible
for setting the field. User thread which has written exactly up to the end
of log block, is considered ending at lsn after the header of the next log
block. That's because after such write, the log writer is allowed to write
the next empty log block (_buf_ready_for_write_lsn_ points then to such lsn).
The _first_rec_group_ field is updated before the link is added to the log
recent written buffer.
-# Avoid read-on-write issue.
The log writer thread is also responsible for writing ahead to avoid
the read-on-write problem. It tracks up to which point the write ahead
has been done. When a write would go further:
- If we were trying to write more than size of single write-ahead
region, we limit the write to completed write-ahead sized regions,
and postpone writing the last fragment for later (retrying with the
first step and updating the _buf_ready_for_write_lsn_).
@note If we needed to write complete regions of write-ahead bytes,
they are ready in the log buffer and could be written directly from
there. Such writes would not cause read-on-write problem, because
size of the writes is divisible by write-ahead region.
- Else, we copy data to special write-ahead buffer, from which
we could safely write the whole single write-ahead sized region.
After copying the data, the write-ahead buffer is completed with
_0x00_ bytes.
@note The write-ahead buffer is also used for copying the last
incomplete log block, which was described in the previous point.
-# Update write_lsn.
After doing single write (single log_data_blocks_write()), the log writer
thread updates @ref subsect_redo_log_write_lsn and fallbacks to its main
loop. That's because a lot more data could be prepared in meantime, as
the write operation could take significant time.
That's why the general rule is that after doing log_data_blocks_write(),
we need to update @ref subsect_redo_log_buf_ready_for_write_lsn before
making next decisions on how much to write within next such call.
-# Notify [log write_notifier thread](@ref sect_redo_log_write_notifier)
using os_event_set() on the _write_notifier_event_.
@see @ref sect_redo_log_waiting_for_writer
-# Notify [log flusher thread](@ref sect_redo_log_flusher) using os_event_set()
on the _flusher_event_.
@section sect_redo_log_flusher Thread: log flusher
The log flusher thread is responsible for doing fsync() of the log files.
When the fsync() calls are finished, the log flusher thread updates the
@ref subsect_redo_log_flushed_to_disk_lsn and notifies the
[log flush_notifier thread](@ref sect_redo_log_flush_notifier) using
os_event_set() on the _flush_notifier_event_.
@remarks
Small optimization has been applied - if there was only a single log block
flushed since the previous flush, then the log flusher thread notifies user
threads directly (instead of notifying the log flush_notifier thread).
Impact of the optimization turned out to be positive for some scenarios and
negative for other, so further investigation is required. However, because
the change seems to make sense from logical point of view, it has been
preserved.
If the log flusher thread detects that none of the conditions is satisfied,
it simply waits and retries the checks. After initial spin delay, it waits
on the _flusher_event_.
@section sect_redo_log_flush_notifier Thread: log flush_notifier
The log flush_notifier thread is responsible for notifying all user threads
that are waiting for @ref subsect_redo_log_flushed_to_disk_lsn >= lsn, when
the condition is satisfied.
@remarks
It also notifies when it is very likely to be satisfied (lsn values are
within the same log block). It is allowed to make mistakes and it is
responsibility of the notified user threads to ensure, that
the _flushed_to_disk_lsn_ is advanced sufficiently.
The log flush_notifier thread waits for the advanced _flushed_to_disk_lsn_
in loop, using os_event_wait_time_low() on the _flush_notifier_event_.
When it gets notified by the [log flusher](@ref sect_redo_log_flusher),
it ensures that the _flushed_to_disk_lsn_ has been advanced (single new
byte is enough though).
It notifies user threads waiting on all events between (inclusive):
- event for a block with the previous value of _flushed_to_disk_lsn_,
- event for a block containing the new value of _flushed_to_disk_lsn_.
Events are assigned per blocks in the circular array of events using mapping:
event_slot = (lsn-1) / OS_FILE_LOG_BLOCK_SIZE % S
where S is size of the array (number of slots with events). Each slot has
single event, which groups all user threads waiting for flush up to any lsn
within the same log block (or log block with number greater by S*i).
@diafile storage/innobase/log/log_notifier_notifications.dia "Notifications executed on slots"
Internal mutex in event is used, to avoid missed notifications (these would
be worse than the false notifications).
However, there is also maximum timeout defined for the waiting on the event.
After the timeout was reached (default: 1ms), the _flushed_to_disk_lsn_ is
re-checked in the user thread (just in case).
@note Because flushes are possible for @ref subsect_redo_log_write_lsn set in
the middle of log block, it is likely that the same slot for the same block
will be notified multiple times in a row. We tried delaying notifications for
the last block, but the results were only worse. It turned out that
latency is extremely important here.
@see @ref sect_redo_log_waiting_for_flusher
@section sect_redo_log_write_notifier Thread: log write_notifier
The log write_notifier thread is responsible for notifying all user threads
that are waiting for @ref subsect_redo_log_write_lsn >= lsn, when the condition
is satisfied.
@remarks
It also notifies when it is very likely to be satisfied (lsn values are
within the same log block). It is allowed to make mistakes and it is
responsibility of the notified user threads to ensure, that the _write_lsn_
is advanced sufficiently.
The log write_notifier thread waits for the advanced _write_lsn_ in loop,
using os_event_wait_time_low() on the _write_notifier_event_.
When it gets notified (by the [log writer](@ref sect_redo_log_writer)),
it ensures that the _write_lsn_ has been advanced (single new byte is enough).
Then it notifies user threads waiting on all events between (inclusive):
- event for a block with the previous value of _write_lsn_,
- event for a block containing the new value of _write_lsn_.
Events are assigned per blocks in the circular array of events using mapping:
event_slot = (lsn-1) / OS_FILE_LOG_BLOCK_SIZE % S
where S is size of the array (number of slots with events). Each slot has
single event, which groups all user threads waiting for write up to any lsn
within the same log block (or log block with number greater by S*i).
Internal mutex in event is used, to avoid missed notifications (these would
be worse than the false notifications).
However, there is also maximum timeout defined for the waiting on the event.
After the timeout was reached (default: 1ms), the _write_lsn_ is re-checked
in the user thread (just in case).
@note Because writes are possible for @ref subsect_redo_log_write_lsn set in
the middle of log block, it is likely that the same slot for the same block
will be notified multiple times in a row.
@see @ref sect_redo_log_waiting_for_writer
@section sect_redo_log_checkpointer Thread: log checkpointer
The log checkpointer thread is responsible for:
-# Checking if a checkpoint write is required (to decrease checkpoint age
before it gets too big).
-# Checking if synchronous flush of dirty pages should be forced on page
cleaner threads, because of space in redo log or age of the oldest page.
-# Writing checkpoints (it's the only thread allowed to do it!).
This thread has been introduced at the very end. It was not required for
the performance, but it makes the design more consistent after we have
introduced other log threads. That's because user threads are not doing
any writes to the log files themselves then. Previously they were writing
checkpoints when needed, which required synchronization between them.
The log checkpointer thread updates log.available_for_checkpoint_lsn,
which is calculated as:
min(log.buf_dirty_pages_added_up_to_lsn, max(0, oldest_lsn - L))
where:
- oldest_lsn = min(oldest modification of the earliest page from each
flush list),
- L is a number of slots in the log recent closed buffer.
The special case is when there is no dirty page in flush lists - then it's
basically set to the _log.buf_dirty_pages_added_up_to_lsn_.
@note Note that previously, all user threads were trying to calculate this
lsn concurrently, causing contention on flush_list mutex, which is required
to read the _oldest_modification_ of the earliest added page. Now the lsn
is updated in single thread.
@section sect_redo_log_waiting_for_writer Waiting until log has been written to
disk
User has to wait until the [log writer thread](@ref sect_redo_log_writer)
has written data from the log buffer to disk for lsn >= _end_lsn_ of log range
used by the user, which is true when:
write_lsn >= end_lsn
The @ref subsect_redo_log_write_lsn is updated by the log writer thread.
The waiting is solved using array of events. The user thread waiting for
a given lsn, waits using the event at position:
slot = (end_lsn - 1) / OS_FILE_LOG_BLOCK_SIZE % S
where _S_ is number of entries in the array. Therefore the event corresponds
to log block which contains the _end_lsn_.
The [log write_notifier thread](@ref sect_redo_log_write_notifier) tracks how
the @ref subsect_redo_log_write_lsn is advanced and notifies user threads for
consecutive slots.
@remarks
When the _write_lsn_ is in the middle of log block, all user threads waiting
for lsn values within the whole block are notified. When user thread is
notified, it checks if the current value of the _write_lsn_ is sufficient and
retries waiting if not. To avoid missed notifications, event's internal mutex
is used.
@section sect_redo_log_waiting_for_flusher Waiting until log has been flushed
to disk
If a user needs to ensure log persistence in case of a crash (e.g. on COMMIT
of a transaction), they have to wait until [log flusher](@ref
sect_redo_log_flusher) has flushed log files to disk for lsn >= _end_lsn_ of
log range used by the user, which is true when:
flushed_to_disk_lsn >= end_lsn
The @ref subsect_redo_log_flushed_to_disk_lsn is updated by the log flusher
thread.
The waiting is solved using array of events. The user thread waiting for
a given lsn, waits using the event at position:
slot = (end_lsn - 1) / OS_FILE_LOG_BLOCK_SIZE % S
where _S_ is number of entries in the array. Therefore the event corresponds
to log block which contains the _end_lsn_.
The [log flush_notifier thread](@ref sect_redo_log_flush_notifier) tracks how
the
@ref subsect_redo_log_flushed_to_disk_lsn is advanced and notifies user
threads for consecutive slots.
@remarks
When the _flushed_to_disk_lsn_ is in the middle of log block, all
user threads waiting for lsn values within the whole block are notified.
When user thread is notified, it checks if the current value of the
_flushed_to_disk_lsn_ is sufficient and retries waiting if not.
To avoid missed notifications, event's internal mutex is used.
@page PAGE_INNODB_REDO_LOG_FORMAT Format of redo log
@section sect_redo_log_format_overview Overview
Redo log contains multiple log files, each has the same format. Consecutive
files have data for consecutive ranges of lsn values. When a file ends at
_end_lsn_, the next log file begins at the _end_lsn_. There is a fixed number
of log files, they are re-used in circular manner. That is, for the last
log file, the first log file is a successor.
@note A single big file would remain fully cached for some of file systems,
even if only a small fragment of the file is being modified. Hence multiple
log files are used to make evictions always possible. Keep in mind though
that log files are used in circular manner (lsn modulo size of redo log files,
when size is calculated except the log file headers).
The log file names are: _#ib_redo0_, _#ib_redo1_, ... and they are stored in
subdirectory #innodb_redo, which is located inside the directory specified by
the innodb_log_group_home_dir (or in the datadir if not specified).
Whenever a new log file is being created, it is created first with the _tmp
suffix in its name. When the file is prepared, it becomes renamed (the suffix
is removed from the name).
When a new data directory is being initialized, all log files that are being
created, have LOG_HEADER_FLAG_NOT_INITIALIZED flag enabled in the log_flags
field in the header. After the data directory is initialized, this flag is
disabled (file header is re-flushed for the newest log file then).
File header contains the log_uuid field. It is a randomly chosen value when
the data directory is being initialized. It is used to detect situation,
in which user mixed log files from different data directories.
File header contains also start_lsn - this is start_lsn of the first log block
within that file.
@section sect_redo_log_format_file Log file format
@subsection subsect_redo_log_format_header Header of log file
%Log file starts with a header of _LOG_FILE_HDR_SIZE_ bytes. It contains:
- Initial block of _OS_FILE_LOG_BLOCK_SIZE_ (512) bytes, which has:
- Binding of an offset within the file to the lsn value.
This binding allows to map any lsn value which is represented
within the file to the corresponding offset within the file.
- Format of redo log - remains the same as before the patch.
- Checksum of the block.
- Two checkpoint blocks - _LOG_CHECKPOINT_1_ and _LOG_CHECKPOINT_2_.
Each checkpoint block contains _OS_FILE_LOG_BLOCK_SIZE_ bytes:
- _checkpoint_lsn_ - lsn to start recovery at.
@note In earlier versions than 8.0, checkpoint_lsn pointed
directly to the beginning of the first log record group,
which should be recovered (but still the related page could
have been flushed). However since 8.0 this value might point
to some byte inside a log record. In such case, recovery is
supposed to skip the group of log records which contains
the checkpoint lsn (and start at the beginning of the next).
We cannot easily determine beginning of the next group.
There are two cases:
- block with _checkpoint_lsn_ has no beginning of group at all
(first_rec_group = 0) - then we search forward for the first
block that has non-zero first_rec_group and there we have
the next group's start,
- block with _checkpoint_lsn_ has one or more groups of records
starting inside the block - then we start parsing at the first
group that starts in the block and keep parsing consecutive
groups until we passed checkpoint_lsn; we don't apply these
groups of records (we must not because of fil renames); after
we passed checkpoint_lsn, the next group that starts is the
one we were looking for to start recovery at; it is possible
that the next group begins in the next block (if there was no
more groups starting after checkpoint_lsn within the block)
- _checkpoint_no_ - checkpoint number - when checkpoint is
being written, a next checkpoint number is assigned.
- _log.buf_size_ - size of the log buffer when the checkpoint
write was started.
It remains a mystery, why do we need that. It's neither used
by the recovery, nor required for MEB. Some rumours say that
maybe it could be useful for auto-config external tools to
detect what configuration of MySQL should be used.
@note
Note that size of the log buffer could be decreased in runtime,
after writing the checkpoint (which was not the case, when this
field was being introduced).
There are two checkpoint headers, because they are updated alternately.
In case of crash in the middle of any such update, the alternate header
would remain valid (so it's the same reason for which double write buffer
is used for pages).
@remarks
Each log file has its own header. Checkpoints defined in checkpoint headers
always refer to LSN values within that file. During the recovery one should
find the file with the newest checkpoint.
@subsection subsect_redo_log_format_blocks Log blocks
After the header, there are consecutive log blocks. Each log block has the same
format and consists of _OS_FILE_LOG_BLOCK_SIZE_ bytes (512). These bytes are
enumerated by lsn values.
@note Bytes used by [headers of log files](@ref subsect_redo_log_format_header)
are NOT included in lsn sequence.
Each log block contains:
- header - _LOG_BLOCK_HDR_SIZE_ bytes (12):
- @anchor a_redo_log_block_hdr_no hdr_no
This is a block number. Consecutive blocks have consecutive numbers.
Hence this is basically lsn divided by _OS_FILE_LOG_BLOCK_SIZE_.
However it is also wrapped at 1G (due to limited size of the field).
It should be possible to wrap it at 2G (only the single flush bit is
reserved as the highest bit) but for historical reasons it is 1G.
- @anchor a_redo_log_block_data_len data_len
Number of bytes within the log block. Possible values:
- _0_ - this is an empty block (end the recovery).
- _OS_FILE_LOG_BLOCK_SIZE_ - this is a full block.
- value within [_LOG_BLOCK_HDR_SIZE_,
_OS_FILE_LOG_BLOCK_SIZE_ - _LOG_BLOCK_TRL_SIZE_),
which means that this is the last block and it is an
incomplete block.
This could be then considered an offset, which points
to the end of the data within the block. This value
includes _LOG_BLOCK_HDR_SIZE_ bytes of the header.
- @anchor a_redo_log_block_first_rec_group first_rec_group
Offset within the log block to the beginning of the first group
of log records that starts within the block or 0 if none starts.
This offset includes _LOG_BLOCK_HDR_SIZE_ bytes of the header.
- @anchor a_redo_log_block_epoch_no epoch_no
Log epoch number. Set by the log writer thread just before a write
starts for the block. For details @see LOG_BLOCK_HDR_EPOCH_NO.
It could be used during recovery to detect that we have read
old block of redo log (tail) because of the wrapped log files.
- data part - bytes up to [data_len](@ref a_redo_log_block_data_len) byte.
Actual data bytes are followed by _0x00_ if the block is incomplete.
@note Bytes within this fragment of the block, are enumerated by _sn_
sequence (whereas bytes of header and trailer are NOT). This is the
only difference between _sn_ and _lsn_ sequences (_lsn_ enumerates
also bytes of header and trailer).
- trailer - _LOG_BLOCK_TRL_SIZE_ bytes (4):
- checksum
Algorithm used for the checksum depends on the configuration.
Note that there is a potential problem if a crash happened just
after switching to "checksums enabled". During recovery some log
blocks would have checksum = LOG_NO_CHECKSUM_MAGIC and some would
have a valid checksum. Then recovery with enabled checksums would
point out problems for the blocks without valid checksum. User would
have to disable checksums for the recovery then.
@remarks
All fields except [first_rec_group](@ref a_redo_log_block_first_rec_group)
are updated by the [log writer thread](@ref sect_redo_log_writer) just before
writing the block.
*******************************************************/
// clang-format on
/** Writes a given fragment of the log buffer to the current redo log file,
unless the file is full, in which case a new file is produced and function
exits (note, that the new log file's header is flushed in such case).
After data to the current log file has been written (log_data_blocks_write()),
the log.write_lsn is advanced accordingly to the number of written bytes,
which might be smaller than the requested number of bytes to write.
That's because this function exits after doing a single write operation.
That's because it might make sense to advance the lsn up to which data
is ready in the log buffer (for writing), before making decision about
next write (e.g. then the next write could be done for full blocks only).
@param[in] log redo log
@param[in] buffer the beginning of first log block to write
@param[in] buffer_size number of bytes to write since 'buffer'
@param[in] start_lsn lsn corresponding to first block start
@return DB_SUCCESS or error */
static dberr_t log_write_buffer(log_t &log, byte *buffer, size_t buffer_size,
lsn_t start_lsn);
/** Called when the redo log writer enters the extra_margin.
Requirement: log.writer_mutex acquired and log.m_write_inside_extra_margin
being false, before calling this function.
@param[in,out] log redo log */
static void log_writer_enter_extra_margin(log_t &log);
/** Called when the redo log writer exits the extra_margin.
Requirement: log.writer_mutex acquired and log.m_write_inside_extra_margin
being true, before calling this function.
@param[in,out] log redo log */
static void log_writer_exit_extra_margin(log_t &log);
/** Waits until there is free space in log files for log_writer to proceed.
@param[in] log redo log
@param[in] last_write_lsn previous log.write_lsn
@param[in] next_write_lsn next log.write_lsn
@return lsn up to which possible write is limited */
static lsn_t log_writer_wait_on_checkpoint(log_t &log, lsn_t last_write_lsn,
lsn_t next_write_lsn);
/** Waits until the archiver has archived enough for log_writer to proceed
or until the archiver becomes aborted.
@param[in] log redo log
@param[in] next_write_lsn next log.write_lsn */
static void log_writer_wait_on_archiver(log_t &log, lsn_t next_write_lsn);
/** Called after a write to the redo log file failed. If the reason was not
related to missing free space or busy file-lock, emits fatal error.
@param[in,out] log redo log
@param[in] err error code (non-zero) */
static void log_writer_write_failed(log_t &log, dberr_t err);
/** Writes fragment of the log buffer, not further than up to provided lsn.
Stops after the first call to log_data_blocks_write() or after producing
a new log file. If some data was written, the log.write_lsn is advanced.
For more details see @see log_write_buffer().
@param[in] log redo log
@param[in] next_write_lsn write up to this lsn value */
static void log_writer_write_buffer(log_t &log, lsn_t next_write_lsn);
/** Executes a synchronous flush of the log files (doing fsyncs).
Advances log.flushed_to_disk_lsn and notifies log flush_notifier thread.
Note: if only a single log block was flushed to disk, user threads
waiting for lsns within the block are notified directly from here,
and log flush_notifier thread is not notified! (optimization)
@param[in,out] log redo log */
static void log_flush_low(log_t &log);
/**************************************************/ /**
@name Waiting for redo log written or flushed up to lsn
*******************************************************/
/** @{ */
/** Computes index of a slot (in array of "wait events"), which should
be used when waiting until redo reached provided lsn.
@param[in]	lsn       lsn up to which waiting takes place
@param[in]	events_n  size of the array (number of slots)
@return index of the slot (integer in range 0 .. events_n-1) */
static inline size_t log_compute_wait_event_slot(lsn_t lsn, size_t events_n) {
  /* The slot is selected for lsn - 1, not lsn, so that the right boundary
  of a log block maps to the slot of that block. The distinction matters
  only when lsn % 512 == 0.

  In that case all threads waiting for some lsn inside block (lsn-1)/512
  must be notified anyway (the previous lsn was smaller, so that block was
  not closed before). Threads waiting for lsn values inside block lsn/512
  wait for larger lsn values and need no wake-up - with the single
  exception of threads waiting exactly at this lsn. Assigning those to the
  slot of (lsn-1)/512 lets us avoid waking the whole lsn/512 group.

  Note this boundary case is common, because the writer prefers to write
  full log blocks only, leaving an incomplete last block for the next
  write (unless there are no full blocks at all). */
  const lsn_t block_index = (lsn - 1) / OS_FILE_LOG_BLOCK_SIZE;

  /* events_n is a power of two, so masking is equivalent to modulo. */
  return block_index & (events_n - 1);
}
/** Computes index of a slot (in array of "wait events"), which should
be used when waiting in log.write_events (for redo written up to lsn).
@param[in]	log  redo log
@param[in]	lsn  lsn up to which waiting (for log.write_lsn)
@return index of the slot (integer in range 0 .. log.write_events_size-1) */
static inline size_t log_compute_write_event_slot(const log_t &log, lsn_t lsn) {
  const size_t n_slots = log.write_events_size;
  return log_compute_wait_event_slot(lsn, n_slots);
}
/** Computes index of a slot (in array of "wait events"), which should
be used when waiting in log.flush_events (for redo flushed up to lsn).
@param[in]	log  redo log
@param[in]	lsn  lsn up to which waiting (for log.flushed_to_disk_lsn)
@return index of the slot (integer in range 0 .. log.flush_events_size-1) */
static inline size_t log_compute_flush_event_slot(const log_t &log, lsn_t lsn) {
  const size_t n_slots = log.flush_events_size;
  return log_compute_wait_event_slot(lsn, n_slots);
}
/** Computes maximum number of spin rounds which should be used when waiting
in user thread (for written or flushed redo) or 0 if busy waiting should not
be used at all.
@param[in]	min_non_zero_value  minimum allowed value (unless 0 is returned)
@return maximum number of spin rounds or 0 */
static inline uint64_t log_max_spins_when_waiting_in_user_thread(
    uint64_t min_non_zero_value) {
  /* Current cpu usage (percentage). */
  const double cpu_pct = srv_cpu_usage.utime_pct;

  /* High-watermark - when cpu usage is higher, don't spin! */
  const uint32_t hwm = srv_log_spin_cpu_pct_hwm;

  if (srv_cpu_usage.utime_abs < srv_log_spin_cpu_abs_lwm || cpu_pct >= hwm) {
    /* Don't spin: either cpu usage is already too high, or the system is
    almost idle so there is no reason to bother. */
    return 0;
  }

  if (cpu_pct >= hwm / 2) {
    /* Cpu usage is above 50% of the hwm - use the minimum allowed number
    of spin rounds, not to increase cpu usage too much (risky). */
    return min_non_zero_value;
  }

  /* Cpu usage is below 50% of the hwm - scale spin rounds into the range
  [minimum, 10*minimum]: the lower the cpu usage, the more spin rounds. */
  const double headroom = 1.0 * (hwm / 2 - cpu_pct) / (hwm / 2);

  return static_cast<uint64_t>(min_non_zero_value +
                               headroom * min_non_zero_value * 9);
}
/** Waits until redo log is written up to provided lsn (or greater).
Whether the written data also reached disk (was flushed) is irrelevant here.
@param[in]	log          redo log
@param[in]	lsn          wait until log.write_lsn >= lsn
@param[in,out]	interrupted  if true, was interrupted, needs retry.
@return statistics related to waiting inside */
static Wait_stats log_wait_for_write(const log_t &log, lsn_t lsn,
                                     bool *interrupted) {
  /* Wake up the log writer in case it sleeps waiting for work. */
  os_event_set(log.writer_event);

  const size_t slot = log_compute_write_event_slot(log, lsn);

  const uint64_t max_spins = log_max_spins_when_waiting_in_user_thread(
      srv_log_wait_for_write_spin_delay);

  auto exit_condition = [&log, lsn, interrupted](bool waited) {
    if (log.write_lsn.load() >= lsn) {
      /* Redo was written far enough - done. */
      *interrupted = false;
      return true;
    }

    if (UNIV_UNLIKELY(
            log.writer_threads_paused.load(std::memory_order_relaxed))) {
      /* The background writer threads are paused; the caller has to
      retry (possibly writing by itself). */
      *interrupted = true;
      return true;
    }

    if (waited) {
      /* Still not written far enough - poke the log writer again. */
      os_event_set(log.writer_event);
    }

    ut_d(log_background_write_threads_active_validate(log));

    return false;
  };

  const auto stats =
      os_event_wait_for(log.write_events[slot], max_spins,
                        get_srv_log_wait_for_write_timeout(), exit_condition);

  MONITOR_INC_WAIT_STATS(MONITOR_LOG_ON_WRITE_, stats);

  return stats;
}
/** Waits until redo log is flushed up to provided lsn (or greater).
Wakes up the log writer and/or flusher threads as needed, then waits on
the flush_events slot assigned to the given lsn.
@param[in] log redo log
@param[in] lsn wait until log.flushed_to_disk_lsn >= lsn
@param[in,out] interrupted if true, was interrupted, needs retry.
@return statistics related to waiting inside */
static Wait_stats log_wait_for_flush(const log_t &log, lsn_t lsn,
                                     bool *interrupted) {
  /* Data can only be flushed after it was written, so if the write did
  not reach lsn yet, wake up the log writer too. */
  if (log.write_lsn.load(std::memory_order_relaxed) < lsn) {
    os_event_set(log.writer_event);
  }
  os_event_set(log.flusher_event);
  uint64_t max_spins = log_max_spins_when_waiting_in_user_thread(
      srv_log_wait_for_flush_spin_delay);
  /* When the average flush time is already above the spin high-watermark,
  spinning would likely be wasted cpu - fall back to event waiting only. */
  if (log.flush_avg_time >= srv_log_wait_for_flush_spin_hwm) {
    max_spins = 0;
  }
  auto stop_condition = [&log, lsn, interrupted](bool wait) {
    log_sync_point("log_wait_for_flush_before_flushed_to_disk_lsn");
    /* Finished: redo was flushed up to (at least) the requested lsn. */
    if (log.flushed_to_disk_lsn.load() >= lsn) {
      *interrupted = false;
      return true;
    }
    /* Background writer/flusher threads are paused - caller must retry. */
    if (UNIV_UNLIKELY(
            log.writer_threads_paused.load(std::memory_order_relaxed))) {
      *interrupted = true;
      return true;
    }
    if (wait) {
      /* Still waiting - poke writer (if needed) and flusher again. */
      if (log.write_lsn.load(std::memory_order_relaxed) < lsn) {
        os_event_set(log.writer_event);
      }
      os_event_set(log.flusher_event);
    }
    log_sync_point("log_wait_for_flush_before_wait");
    return false;
  };
  const size_t slot = log_compute_flush_event_slot(log, lsn);
  /* Report to the server layer that this thread waits for group commit,
  for the duration of the event wait. */
  thd_wait_begin(nullptr, THD_WAIT_GROUP_COMMIT);
  const auto wait_stats =
      os_event_wait_for(log.flush_events[slot], max_spins,
                        get_srv_log_wait_for_flush_timeout(), stop_condition);
  thd_wait_end(nullptr);
  MONITOR_INC_WAIT_STATS(MONITOR_LOG_ON_FLUSH_, wait_stats);
  return wait_stats;
}
/** Write the redo log up to a provided lsn by itself, if necessary.
@param[in] log redo log
@param[in] end_lsn lsn to write for
@param[in] flush_to_disk whether the written log should also be flushed
@param[in,out] interrupted if true, was interrupted, needs retry
@return statistics about waiting inside */
static Wait_stats log_self_write_up_to(log_t &log, lsn_t end_lsn,
bool flush_to_disk, bool *interrupted) {
ut_ad(!mutex_own(&(log.writer_mutex)));
uint32_t waits = 0;
*interrupted = false;
lsn_t ready_lsn = log_buffer_ready_for_write_lsn(log);
ulong i = 0;
/* must wait for (ready_lsn >= end_lsn) at first */
while (i < srv_n_spin_wait_rounds && ready_lsn < end_lsn) {
if (srv_spin_wait_delay) {
ut_delay(ut::random_from_interval_fast(0, srv_spin_wait_delay));
}
i++;
ready_lsn = log_buffer_ready_for_write_lsn(log);
}
if (ready_lsn < end_lsn) {
log.recent_written.advance_tail();
ready_lsn = log_buffer_ready_for_write_lsn(log);
}
if (ready_lsn < end_lsn) {
std::this_thread::yield();
ready_lsn = log_buffer_ready_for_write_lsn(log);
}
while (ready_lsn < end_lsn) {
/* wait using event */
log_closer_mutex_enter(log);
if (log.current_ready_waiting_lsn == 0 &&
os_event_is_set(log.closer_event)) {
log.current_ready_waiting_lsn = end_lsn;
log.current_ready_waiting_sig_count = os_event_reset(log.closer_event);
}
const auto sig_count = log.current_ready_waiting_sig_count;
log_closer_mutex_exit(log);
++waits;
os_event_wait_time_low(log.closer_event, std::chrono::milliseconds{100},
sig_count);
log.recent_written.advance_tail();
ready_lsn = log_buffer_ready_for_write_lsn(log);
}
/* NOTE: Currently doesn't do dirty read for (flush_to_disk == true) case,
because the mutex contention also works as the arbitrator for write-IO
(fsync) bandwidth between log files and data files. */
if (!flush_to_disk &&
log.write_lsn.load(std::memory_order_acquire) >= end_lsn) {
return Wait_stats{waits};
}
/* mysql-test compatibility */
log_sync_point("log_wait_for_flush_before_flushed_to_disk_lsn");
log_sync_point("log_wait_for_flush_before_wait");
log_writer_mutex_enter(log);