From bd4dd0cab193c36d7fbc0430562e03d6107c097c Mon Sep 17 00:00:00 2001 From: Badrish Chandramouli Date: Tue, 14 Aug 2018 13:47:05 -0700 Subject: [PATCH] Initial commit --- .gitattributes | 63 + .gitignore | 194 + README.md | 34 + cc/CMakeLists.txt | 102 + cc/CMakeLists.txt.in | 17 + cc/README.md | 68 + cc/benchmark-dir/CMakeLists.txt | 10 + cc/benchmark-dir/README.md | 17 + cc/benchmark-dir/benchmark.cc | 628 +++ cc/benchmark-dir/benchmark.vcxproj | 170 + cc/benchmark-dir/benchmark.vcxproj.filters | 30 + cc/benchmark-dir/file.h | 61 + cc/benchmark-dir/process_ycsb.cc | 38 + cc/playground/CMakeLists.txt | 1 + cc/playground/sum_store-dir/CMakeLists.txt | 9 + .../sum_store-dir/concurrent_recovery_test.h | 276 ++ .../single_threaded_recovery_test.h | 140 + cc/playground/sum_store-dir/sum_store.cc | 77 + cc/playground/sum_store-dir/sum_store.h | 160 + cc/src/CMakeLists.txt | 63 + cc/src/core/address.cc | 12 + cc/src/core/address.h | 177 + cc/src/core/alloc.h | 35 + cc/src/core/async.h | 132 + cc/src/core/async_result_types.h | 60 + cc/src/core/auto_ptr.h | 123 + cc/src/core/checkpoint_locks.h | 192 + cc/src/core/checkpoint_state.h | 166 + cc/src/core/constants.h | 20 + cc/src/core/faster.h | 2558 +++++++++++ cc/src/core/gc_state.h | 40 + cc/src/core/grow_state.h | 44 + cc/src/core/guid.h | 142 + cc/src/core/hash_bucket.h | 201 + cc/src/core/hash_table.h | 294 ++ cc/src/core/internal_contexts.h | 379 ++ cc/src/core/key_hash.h | 54 + cc/src/core/light_epoch.h | 328 ++ cc/src/core/lss_allocator.cc | 169 + cc/src/core/lss_allocator.h | 237 ++ cc/src/core/malloc_fixed_page_size.h | 582 +++ cc/src/core/native_buffer_pool.h | 188 + cc/src/core/persistent_memory_malloc.h | 1021 +++++ cc/src/core/phase.h | 51 + cc/src/core/record.h | 151 + cc/src/core/recovery_status.h | 59 + cc/src/core/state_transitions.h | 162 + cc/src/core/status.h | 30 + cc/src/core/thread.cc | 26 + cc/src/core/thread.h | 103 + cc/src/core/utility.h | 56 + cc/src/device/file_system_disk.h | 527 +++ cc/src/device/null_disk.h | 124 + cc/src/environment/file.h | 10 + cc/src/environment/file_common.h | 60 + cc/src/environment/file_linux.cc | 199 + cc/src/environment/file_linux.h | 254 ++ cc/src/environment/file_windows.cc | 372 ++ cc/src/environment/file_windows.h | 415 ++ cc/test/CMakeLists.txt | 11 + cc/test/in_memory_test.cc | 1912 +++++++++ cc/test/malloc_fixed_page_size_test.cc | 81 + cc/test/paging_queue_test.cc | 27 + cc/test/paging_test.h | 1017 +++++ cc/test/paging_threadpool_test.cc | 27 + cc/test/recovery_queue_test.cc | 31 + cc/test/recovery_test.h | 3753 +++++++++++++++++ cc/test/recovery_threadpool_test.cc | 31 + cc/test/utility_test.cc | 25 + cs/src/FASTER.sln | 149 + cs/src/benchmark/App.config | 17 + cs/src/benchmark/FASTER.benchmark.csproj | 43 + cs/src/benchmark/FasterYcsbBenchmark.cs | 585 +++ cs/src/benchmark/Program.cs | 58 + cs/src/benchmark/Properties/AssemblyInfo.cs | 39 + cs/src/benchmark/RandomGenerator.cs | 85 + cs/src/core/Allocator/IAllocator.cs | 14 + cs/src/core/Allocator/MallocFixedPageSize.cs | 615 +++ .../core/Allocator/PersistentMemoryMalloc.cs | 899 ++++ cs/src/core/Codegen/CompilerBase.cs | 228 + .../core/Codegen/FasterHashTableCompiler.cs | 132 + cs/src/core/Codegen/HashTableManager.cs | 49 + ...MixedBlitManagedFasterHashTableCompiler.cs | 257 ++ cs/src/core/Codegen/RoslynHelpers.cs | 224 + cs/src/core/Codegen/TypeReplacer.cs | 108 + cs/src/core/Codegen/TypeReplacerCompiler.cs | 38 + cs/src/core/Codegen/Utilities.cs | 240 ++ cs/src/core/Device/IDevice.cs | 29 + 
cs/src/core/Device/ISegmentedDevice.cs | 24 + cs/src/core/Device/LocalStorageDevice.cs | 268 ++ cs/src/core/Device/MemoryDevice.cs | 195 + cs/src/core/Device/NullDevice.cs | 48 + .../Device/SegmentedLocalStorageDevice.cs | 317 ++ cs/src/core/Device/SegmentedNullDevice.cs | 59 + cs/src/core/Device/WrappedDevice.cs | 69 + cs/src/core/Epochs/LightEpoch.cs | 531 +++ cs/src/core/FASTER.core.csproj | 67 + cs/src/core/FASTER.core.nuspec | 33 + cs/src/core/Index/Common/AddressInfo.cs | 99 + cs/src/core/Index/Common/Contexts.cs | 424 ++ cs/src/core/Index/Common/Layout.cs | 144 + cs/src/core/Index/Common/RecordInfo.cs | 241 ++ cs/src/core/Index/FASTER/AsyncIO.cs | 561 +++ cs/src/core/Index/FASTER/Checkpoint.cs | 723 ++++ cs/src/core/Index/FASTER/FASTER.cs | 288 ++ cs/src/core/Index/FASTER/FASTERBase.cs | 709 ++++ cs/src/core/Index/FASTER/FASTERImpl.cs | 1736 ++++++++ cs/src/core/Index/FASTER/FASTERThread.cs | 315 ++ cs/src/core/Index/FASTER/IFASTER.cs | 38 + cs/src/core/Index/FASTER/IndexCheckpoint.cs | 173 + cs/src/core/Index/FASTER/IndexRecovery.cs | 148 + cs/src/core/Index/FASTER/Recovery.cs | 544 +++ cs/src/core/Index/UserCode/Context.cs | 22 + cs/src/core/Index/UserCode/Functions.cs | 154 + cs/src/core/Index/UserCode/Input.cs | 17 + cs/src/core/Index/UserCode/Key.cs | 170 + cs/src/core/Index/UserCode/Output.cs | 23 + cs/src/core/Index/UserCode/Value.cs | 314 ++ .../core/ManagedLayer/BlittableTypeWrapper.cs | 32 + cs/src/core/ManagedLayer/FASTERFactory.cs | 58 + cs/src/core/ManagedLayer/IFASTERKey.cs | 16 + cs/src/core/ManagedLayer/IFASTERValue.cs | 14 + cs/src/core/ManagedLayer/IFASTER_Mixed.cs | 33 + cs/src/core/ManagedLayer/IManagedFAST.cs | 30 + cs/src/core/ManagedLayer/IUserFunctions.cs | 24 + .../core/ManagedLayer/MixedContextWrapper.cs | 43 + .../ManagedLayer/MixedFunctionsWrapper.cs | 132 + cs/src/core/ManagedLayer/MixedInputWrapper.cs | 55 + cs/src/core/ManagedLayer/MixedKeyWrapper.cs | 133 + cs/src/core/ManagedLayer/MixedManagedFAST.cs | 481 +++ .../core/ManagedLayer/MixedOutputWrapper.cs | 38 + .../core/ManagedLayer/MixedUnwrappedTypes.cs | 96 + .../core/ManagedLayer/MixedUserFunctions.cs | 44 + cs/src/core/ManagedLayer/MixedValueWrapper.cs | 345 ++ cs/src/core/Properties/AssemblyInfo.cs | 24 + cs/src/core/Properties/Resources.Designer.cs | 713 ++++ cs/src/core/Properties/Resources.resx | 190 + cs/src/core/Utilities/AsyncResultTypes.cs | 152 + cs/src/core/Utilities/Native32.cs | 364 ++ cs/src/core/Utilities/NativeBufferPool.cs | 144 + .../Utilities/SafeConcurrentDictionary.cs | 233 + cs/src/core/Utilities/StateTransitions.cs | 73 + cs/src/core/Utilities/Status.cs | 29 + cs/src/core/Utilities/Utility.cs | 243 ++ cs/src/native/adv-file-ops/adv-file-ops.cpp | 134 + .../native/adv-file-ops/adv-file-ops.vcxproj | 80 + cs/src/native/readtsc/readtsc.cpp | 10 + cs/src/native/readtsc/readtsc.vcxproj | 89 + cs/src/native/readtsc/readtsc.vcxproj.filters | 22 + .../playground/ClassCache/ClassCache.csproj | 15 + cs/src/playground/ClassCache/Program.cs | 81 + cs/src/playground/ClassCache/Types.cs | 122 + cs/src/playground/ManagedSample1/App.config | 6 + cs/src/playground/ManagedSample1/Functions.cs | 89 + .../ManagedSample1/ICustomFaster.cs | 29 + .../playground/ManagedSample1/InputStruct.cs | 18 + cs/src/playground/ManagedSample1/KeyStruct.cs | 66 + .../ManagedSample1/ManagedSample1.csproj | 39 + .../playground/ManagedSample1/OutputStruct.cs | 16 + cs/src/playground/ManagedSample1/Program.cs | 56 + .../ManagedSample1/Properties/AssemblyInfo.cs | 22 + .../playground/ManagedSample1/ValueStruct.cs | 
72 + cs/src/playground/ManagedSample2/App.config | 6 + .../ManagedSample2/CustomFunctions.cs | 48 + .../playground/ManagedSample2/CustomTypes.cs | 29 + .../ManagedSample2/ManagedSample2.csproj | 39 + cs/src/playground/ManagedSample2/Program.cs | 53 + .../ManagedSample2/Properties/AssemblyInfo.cs | 23 + cs/src/playground/ManagedSample3/App.config | 6 + .../ManagedSample3/ManagedSample3.csproj | 44 + cs/src/playground/ManagedSample3/Program.cs | 144 + .../ManagedSample3/Properties/AssemblyInfo.cs | 22 + cs/src/playground/ManagedSample4/App.config | 6 + .../ManagedSample4/ManagedSample4.csproj | 38 + cs/src/playground/ManagedSample4/Program.cs | 158 + cs/src/playground/NestedTypesTest/App.config | 6 + .../playground/NestedTypesTest/Functions.cs | 172 + .../NestedTypesTest/NestedTypesTest.csproj | 45 + cs/src/playground/NestedTypesTest/Program.cs | 129 + cs/src/playground/NestedTypesTest/Types.cs | 353 ++ cs/src/playground/SumStore/AdId.cs | 62 + cs/src/playground/SumStore/App.config | 9 + .../SumStore/ConcurrentRecoveryTest.cs | 362 ++ cs/src/playground/SumStore/ConcurrentTest.cs | 244 ++ cs/src/playground/SumStore/Functions.cs | 87 + cs/src/playground/SumStore/ICustomFaster.cs | 36 + cs/src/playground/SumStore/Input.cs | 19 + cs/src/playground/SumStore/NumClicks.cs | 69 + cs/src/playground/SumStore/Output.cs | 16 + cs/src/playground/SumStore/Program.cs | 67 + .../SumStore/Properties/AssemblyInfo.cs | 39 + .../SumStore/SingleThreadedRecoveryTest.cs | 171 + cs/src/playground/SumStore/SumStore.csproj | 70 + cs/src/test/BasicFASTERTests.cs | 137 + cs/src/test/ComponentRecoveryTests.cs | 150 + cs/src/test/FASTER.test.csproj | 127 + cs/src/test/FullRecoveryTests.cs | 205 + cs/src/test/ObjectFASTERTests.cs | 69 + cs/src/test/ObjectTestTypes.cs | 119 + cs/src/test/Properties/AssemblyInfo.cs | 39 + cs/src/test/RecoveryTestTypes.cs | 253 ++ cs/src/test/TestTypes.cs | 253 ++ cs/src/test/app.config | 21 + 203 files changed, 39739 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 README.md create mode 100644 cc/CMakeLists.txt create mode 100644 cc/CMakeLists.txt.in create mode 100644 cc/README.md create mode 100644 cc/benchmark-dir/CMakeLists.txt create mode 100644 cc/benchmark-dir/README.md create mode 100644 cc/benchmark-dir/benchmark.cc create mode 100644 cc/benchmark-dir/benchmark.vcxproj create mode 100644 cc/benchmark-dir/benchmark.vcxproj.filters create mode 100644 cc/benchmark-dir/file.h create mode 100644 cc/benchmark-dir/process_ycsb.cc create mode 100644 cc/playground/CMakeLists.txt create mode 100644 cc/playground/sum_store-dir/CMakeLists.txt create mode 100644 cc/playground/sum_store-dir/concurrent_recovery_test.h create mode 100644 cc/playground/sum_store-dir/single_threaded_recovery_test.h create mode 100644 cc/playground/sum_store-dir/sum_store.cc create mode 100644 cc/playground/sum_store-dir/sum_store.h create mode 100644 cc/src/CMakeLists.txt create mode 100644 cc/src/core/address.cc create mode 100644 cc/src/core/address.h create mode 100644 cc/src/core/alloc.h create mode 100644 cc/src/core/async.h create mode 100644 cc/src/core/async_result_types.h create mode 100644 cc/src/core/auto_ptr.h create mode 100644 cc/src/core/checkpoint_locks.h create mode 100644 cc/src/core/checkpoint_state.h create mode 100644 cc/src/core/constants.h create mode 100644 cc/src/core/faster.h create mode 100644 cc/src/core/gc_state.h create mode 100644 cc/src/core/grow_state.h create mode 100644 cc/src/core/guid.h create mode 100644 cc/src/core/hash_bucket.h 
create mode 100644 cc/src/core/hash_table.h create mode 100644 cc/src/core/internal_contexts.h create mode 100644 cc/src/core/key_hash.h create mode 100644 cc/src/core/light_epoch.h create mode 100644 cc/src/core/lss_allocator.cc create mode 100644 cc/src/core/lss_allocator.h create mode 100644 cc/src/core/malloc_fixed_page_size.h create mode 100644 cc/src/core/native_buffer_pool.h create mode 100644 cc/src/core/persistent_memory_malloc.h create mode 100644 cc/src/core/phase.h create mode 100644 cc/src/core/record.h create mode 100644 cc/src/core/recovery_status.h create mode 100644 cc/src/core/state_transitions.h create mode 100644 cc/src/core/status.h create mode 100644 cc/src/core/thread.cc create mode 100644 cc/src/core/thread.h create mode 100644 cc/src/core/utility.h create mode 100644 cc/src/device/file_system_disk.h create mode 100644 cc/src/device/null_disk.h create mode 100644 cc/src/environment/file.h create mode 100644 cc/src/environment/file_common.h create mode 100644 cc/src/environment/file_linux.cc create mode 100644 cc/src/environment/file_linux.h create mode 100644 cc/src/environment/file_windows.cc create mode 100644 cc/src/environment/file_windows.h create mode 100644 cc/test/CMakeLists.txt create mode 100644 cc/test/in_memory_test.cc create mode 100644 cc/test/malloc_fixed_page_size_test.cc create mode 100644 cc/test/paging_queue_test.cc create mode 100644 cc/test/paging_test.h create mode 100644 cc/test/paging_threadpool_test.cc create mode 100644 cc/test/recovery_queue_test.cc create mode 100644 cc/test/recovery_test.h create mode 100644 cc/test/recovery_threadpool_test.cc create mode 100644 cc/test/utility_test.cc create mode 100644 cs/src/FASTER.sln create mode 100644 cs/src/benchmark/App.config create mode 100644 cs/src/benchmark/FASTER.benchmark.csproj create mode 100644 cs/src/benchmark/FasterYcsbBenchmark.cs create mode 100644 cs/src/benchmark/Program.cs create mode 100644 cs/src/benchmark/Properties/AssemblyInfo.cs create mode 100644 cs/src/benchmark/RandomGenerator.cs create mode 100644 cs/src/core/Allocator/IAllocator.cs create mode 100644 cs/src/core/Allocator/MallocFixedPageSize.cs create mode 100644 cs/src/core/Allocator/PersistentMemoryMalloc.cs create mode 100644 cs/src/core/Codegen/CompilerBase.cs create mode 100644 cs/src/core/Codegen/FasterHashTableCompiler.cs create mode 100644 cs/src/core/Codegen/HashTableManager.cs create mode 100644 cs/src/core/Codegen/MixedBlitManagedFasterHashTableCompiler.cs create mode 100644 cs/src/core/Codegen/RoslynHelpers.cs create mode 100644 cs/src/core/Codegen/TypeReplacer.cs create mode 100644 cs/src/core/Codegen/TypeReplacerCompiler.cs create mode 100644 cs/src/core/Codegen/Utilities.cs create mode 100644 cs/src/core/Device/IDevice.cs create mode 100644 cs/src/core/Device/ISegmentedDevice.cs create mode 100644 cs/src/core/Device/LocalStorageDevice.cs create mode 100644 cs/src/core/Device/MemoryDevice.cs create mode 100644 cs/src/core/Device/NullDevice.cs create mode 100644 cs/src/core/Device/SegmentedLocalStorageDevice.cs create mode 100644 cs/src/core/Device/SegmentedNullDevice.cs create mode 100644 cs/src/core/Device/WrappedDevice.cs create mode 100644 cs/src/core/Epochs/LightEpoch.cs create mode 100644 cs/src/core/FASTER.core.csproj create mode 100644 cs/src/core/FASTER.core.nuspec create mode 100644 cs/src/core/Index/Common/AddressInfo.cs create mode 100644 cs/src/core/Index/Common/Contexts.cs create mode 100644 cs/src/core/Index/Common/Layout.cs create mode 100644 cs/src/core/Index/Common/RecordInfo.cs create 
mode 100644 cs/src/core/Index/FASTER/AsyncIO.cs create mode 100644 cs/src/core/Index/FASTER/Checkpoint.cs create mode 100644 cs/src/core/Index/FASTER/FASTER.cs create mode 100644 cs/src/core/Index/FASTER/FASTERBase.cs create mode 100644 cs/src/core/Index/FASTER/FASTERImpl.cs create mode 100644 cs/src/core/Index/FASTER/FASTERThread.cs create mode 100644 cs/src/core/Index/FASTER/IFASTER.cs create mode 100644 cs/src/core/Index/FASTER/IndexCheckpoint.cs create mode 100644 cs/src/core/Index/FASTER/IndexRecovery.cs create mode 100644 cs/src/core/Index/FASTER/Recovery.cs create mode 100644 cs/src/core/Index/UserCode/Context.cs create mode 100644 cs/src/core/Index/UserCode/Functions.cs create mode 100644 cs/src/core/Index/UserCode/Input.cs create mode 100644 cs/src/core/Index/UserCode/Key.cs create mode 100644 cs/src/core/Index/UserCode/Output.cs create mode 100644 cs/src/core/Index/UserCode/Value.cs create mode 100644 cs/src/core/ManagedLayer/BlittableTypeWrapper.cs create mode 100644 cs/src/core/ManagedLayer/FASTERFactory.cs create mode 100644 cs/src/core/ManagedLayer/IFASTERKey.cs create mode 100644 cs/src/core/ManagedLayer/IFASTERValue.cs create mode 100644 cs/src/core/ManagedLayer/IFASTER_Mixed.cs create mode 100644 cs/src/core/ManagedLayer/IManagedFAST.cs create mode 100644 cs/src/core/ManagedLayer/IUserFunctions.cs create mode 100644 cs/src/core/ManagedLayer/MixedContextWrapper.cs create mode 100644 cs/src/core/ManagedLayer/MixedFunctionsWrapper.cs create mode 100644 cs/src/core/ManagedLayer/MixedInputWrapper.cs create mode 100644 cs/src/core/ManagedLayer/MixedKeyWrapper.cs create mode 100644 cs/src/core/ManagedLayer/MixedManagedFAST.cs create mode 100644 cs/src/core/ManagedLayer/MixedOutputWrapper.cs create mode 100644 cs/src/core/ManagedLayer/MixedUnwrappedTypes.cs create mode 100644 cs/src/core/ManagedLayer/MixedUserFunctions.cs create mode 100644 cs/src/core/ManagedLayer/MixedValueWrapper.cs create mode 100644 cs/src/core/Properties/AssemblyInfo.cs create mode 100644 cs/src/core/Properties/Resources.Designer.cs create mode 100644 cs/src/core/Properties/Resources.resx create mode 100644 cs/src/core/Utilities/AsyncResultTypes.cs create mode 100644 cs/src/core/Utilities/Native32.cs create mode 100644 cs/src/core/Utilities/NativeBufferPool.cs create mode 100644 cs/src/core/Utilities/SafeConcurrentDictionary.cs create mode 100644 cs/src/core/Utilities/StateTransitions.cs create mode 100644 cs/src/core/Utilities/Status.cs create mode 100644 cs/src/core/Utilities/Utility.cs create mode 100644 cs/src/native/adv-file-ops/adv-file-ops.cpp create mode 100644 cs/src/native/adv-file-ops/adv-file-ops.vcxproj create mode 100644 cs/src/native/readtsc/readtsc.cpp create mode 100644 cs/src/native/readtsc/readtsc.vcxproj create mode 100644 cs/src/native/readtsc/readtsc.vcxproj.filters create mode 100644 cs/src/playground/ClassCache/ClassCache.csproj create mode 100644 cs/src/playground/ClassCache/Program.cs create mode 100644 cs/src/playground/ClassCache/Types.cs create mode 100644 cs/src/playground/ManagedSample1/App.config create mode 100644 cs/src/playground/ManagedSample1/Functions.cs create mode 100644 cs/src/playground/ManagedSample1/ICustomFaster.cs create mode 100644 cs/src/playground/ManagedSample1/InputStruct.cs create mode 100644 cs/src/playground/ManagedSample1/KeyStruct.cs create mode 100644 cs/src/playground/ManagedSample1/ManagedSample1.csproj create mode 100644 cs/src/playground/ManagedSample1/OutputStruct.cs create mode 100644 cs/src/playground/ManagedSample1/Program.cs create mode 100644 
cs/src/playground/ManagedSample1/Properties/AssemblyInfo.cs create mode 100644 cs/src/playground/ManagedSample1/ValueStruct.cs create mode 100644 cs/src/playground/ManagedSample2/App.config create mode 100644 cs/src/playground/ManagedSample2/CustomFunctions.cs create mode 100644 cs/src/playground/ManagedSample2/CustomTypes.cs create mode 100644 cs/src/playground/ManagedSample2/ManagedSample2.csproj create mode 100644 cs/src/playground/ManagedSample2/Program.cs create mode 100644 cs/src/playground/ManagedSample2/Properties/AssemblyInfo.cs create mode 100644 cs/src/playground/ManagedSample3/App.config create mode 100644 cs/src/playground/ManagedSample3/ManagedSample3.csproj create mode 100644 cs/src/playground/ManagedSample3/Program.cs create mode 100644 cs/src/playground/ManagedSample3/Properties/AssemblyInfo.cs create mode 100644 cs/src/playground/ManagedSample4/App.config create mode 100644 cs/src/playground/ManagedSample4/ManagedSample4.csproj create mode 100644 cs/src/playground/ManagedSample4/Program.cs create mode 100644 cs/src/playground/NestedTypesTest/App.config create mode 100644 cs/src/playground/NestedTypesTest/Functions.cs create mode 100644 cs/src/playground/NestedTypesTest/NestedTypesTest.csproj create mode 100644 cs/src/playground/NestedTypesTest/Program.cs create mode 100644 cs/src/playground/NestedTypesTest/Types.cs create mode 100644 cs/src/playground/SumStore/AdId.cs create mode 100644 cs/src/playground/SumStore/App.config create mode 100644 cs/src/playground/SumStore/ConcurrentRecoveryTest.cs create mode 100644 cs/src/playground/SumStore/ConcurrentTest.cs create mode 100644 cs/src/playground/SumStore/Functions.cs create mode 100644 cs/src/playground/SumStore/ICustomFaster.cs create mode 100644 cs/src/playground/SumStore/Input.cs create mode 100644 cs/src/playground/SumStore/NumClicks.cs create mode 100644 cs/src/playground/SumStore/Output.cs create mode 100644 cs/src/playground/SumStore/Program.cs create mode 100644 cs/src/playground/SumStore/Properties/AssemblyInfo.cs create mode 100644 cs/src/playground/SumStore/SingleThreadedRecoveryTest.cs create mode 100644 cs/src/playground/SumStore/SumStore.csproj create mode 100644 cs/src/test/BasicFASTERTests.cs create mode 100644 cs/src/test/ComponentRecoveryTests.cs create mode 100644 cs/src/test/FASTER.test.csproj create mode 100644 cs/src/test/FullRecoveryTests.cs create mode 100644 cs/src/test/ObjectFASTERTests.cs create mode 100644 cs/src/test/ObjectTestTypes.cs create mode 100644 cs/src/test/Properties/AssemblyInfo.cs create mode 100644 cs/src/test/RecoveryTestTypes.cs create mode 100644 cs/src/test/TestTypes.cs create mode 100644 cs/src/test/app.config diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..1ff0c4230 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,63 @@ +############################################################################### +# Set default behavior to automatically normalize line endings. +############################################################################### +* text=auto + +############################################################################### +# Set default behavior for command prompt diff. +# +# This is need for earlier builds of msysgit that does not have it on by +# default for csharp files. 
+# Note: This is only used by command line +############################################################################### +#*.cs diff=csharp + +############################################################################### +# Set the merge driver for project and solution files +# +# Merging from the command prompt will add diff markers to the files if there +# are conflicts (Merging from VS is not affected by the settings below, in VS +# the diff markers are never inserted). Diff markers may cause the following +# file extensions to fail to load in VS. An alternative would be to treat +# these files as binary and thus will always conflict and require user +# intervention with every merge. To do so, just uncomment the entries below +############################################################################### +#*.sln merge=binary +#*.csproj merge=binary +#*.vbproj merge=binary +#*.vcxproj merge=binary +#*.vcproj merge=binary +#*.dbproj merge=binary +#*.fsproj merge=binary +#*.lsproj merge=binary +#*.wixproj merge=binary +#*.modelproj merge=binary +#*.sqlproj merge=binary +#*.wwaproj merge=binary + +############################################################################### +# behavior for image files +# +# image files are treated as binary by default. +############################################################################### +#*.jpg binary +#*.png binary +#*.gif binary + +############################################################################### +# diff behavior for common document formats +# +# Convert binary document formats to text before diffing them. This feature +# is only available from the command line. Turn it on by uncommenting the +# entries below. +############################################################################### +#*.doc diff=astextplain +#*.DOC diff=astextplain +#*.docx diff=astextplain +#*.DOCX diff=astextplain +#*.dot diff=astextplain +#*.DOT diff=astextplain +#*.pdf diff=astextplain +#*.PDF diff=astextplain +#*.rtf diff=astextplain +#*.RTF diff=astextplain diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..51b2e9ed1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,194 @@ +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. 
+ +# User-specific files +*.suo +*.user +*.sln.docstates + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +x64/ +build/ +bld/ +[Bb]in/ +[Oo]bj/ + +# Roslyn cache directories +*.ide/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +#NUNIT +*.VisualState.xml +TestResult.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +*_i.c +*_p.c +*_i.h +*.ilk +*.meta +*.obj +*.pch +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opensdf +*.sdf +*.cachefile + +# Visual Studio profiler +*.psess +*.vsp +*.vspx + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# JustCode is a .NET coding addin-in +.JustCode + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# NCrunch +_NCrunch_* +.*crunch*.local.xml + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +## TODO: Comment the next line if you want to checkin your +## web deploy settings but do note that will include unencrypted +## passwords +#*.pubxml + +# NuGet Packages Directory +packages/* +## TODO: If the tool you use requires repositories.config +## uncomment the next line +#!packages/repositories.config + +# Enable "build/" folder in the NuGet Packages folder since +# NuGet packages use it for MSBuild targets. +# This line needs to be after the ignore of the build folder +# (and the packages folder if the line above has been uncommented) +!packages/build/ + +# Windows Azure Build Output +csx/ +*.build.csdef + +# Windows Store app package directory +AppPackages/ + +# Others +sql/ +*.Cache +ClientBin/ +[Ss]tyle[Cc]op.* +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.pfx +*.publishsettings +node_modules/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm + +# SQL Server files +*.mdf +*.ldf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings + +# Microsoft Fakes +FakesAssemblies/ + +# LightSwitch generated files +GeneratedArtifacts/ +_Pvt_Extensions/ +ModelManifest.xml +packages/ +*.VC.db* +*.VC.opendb +/.vs/ +/cs/src/.vs/ +*.lib \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 000000000..46e0bf211 --- /dev/null +++ b/README.md @@ -0,0 +1,34 @@ +# Introduction + +Managing large application state easily and with high performance is one of the hardest problems +in the cloud today. We present FASTER, a new concurrent key-value store designed for point lookups +and heavy updates. FASTER supports data larger than memory, by leveraging fast external storage. 
+What differentiates FASTER are its cache-optimized index that achieves very high performance — up
+to 160 million operations per second when data fits in memory; its unique “hybrid record log” design
+that combines a traditional persistent log with in-place updates, to shape the memory working set
+and retain performance; and its architecture as a component that can be embedded in cloud apps. FASTER
+achieves higher throughput than current systems, by more than two orders of magnitude, and scales better
+than current pure in-memory data structures, for in-memory working sets. FASTER also offers a new consistent
+recovery scheme that achieves better performance at the expense of slightly higher commit latency.
+
+# Getting Started
+
+Go to [our website](http://aka.ms/FASTER) for more details and papers.
+
+# Build and Test in C#
+
+Clone the repo, open /cs/src/FASTER.sln, build using VS 2017.
+
+# Contributing
+
+This project welcomes contributions and suggestions. Most contributions require you to agree to a
+Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
+the rights to use your contribution. For details, visit https://cla.microsoft.com.
+
+When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
+a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
+provided by the bot. You will only need to do this once across all repos using our CLA.
+
+This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
+contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
\ No newline at end of file
diff --git a/cc/CMakeLists.txt b/cc/CMakeLists.txt
new file mode 100644
index 000000000..8a8b8d7e6
--- /dev/null
+++ b/cc/CMakeLists.txt
@@ -0,0 +1,102 @@
+cmake_minimum_required (VERSION 3.2.2)
+
+enable_testing()
+
+include(ExternalProject)
+project(FASTER)
+
+if (MSVC)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /Gm- /W3 /WX /EHsc /GS /fp:precise /permissive- /Zc:wchar_t /Zc:forScope /Zc:inline /Gd /TP")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /wd4996")
+
+  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1 /MDd")
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /O2 /Oi /Gy- /MD")
+
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG /OPT:REF /OPT:NOICF /INCREMENTAL:NO")
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG /OPT:REF /OPT:NOICF /INCREMENTAL:NO")
+else()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+
+  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og -g -D_DEBUG")
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -g")
+endif()
+
+#Always set _DEBUG compiler directive when compiling bits regardless of target OS
+set_directory_properties(PROPERTIES COMPILE_DEFINITIONS_DEBUG "_DEBUG")
+
+##### BEGIN GOOGLE TEST INSTALLATION #####
+# Copied from https://github.com/google/googletest/tree/master/googletest#incorporating-into-an-existing-cmake-project
+# Download and unpack googletest at configure time
+configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
+execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+ RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) +if(result) + message(FATAL_ERROR "CMake step for googletest failed: ${result}") +endif() +execute_process(COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) +if(result) + message(FATAL_ERROR "Build step for googletest failed: ${result}") +endif() + +# Prevent overriding the parent project's compiler/linker +# settings on Windows +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + +# Add googletest directly to our build. This defines +# the gtest and gtest_main targets. +add_subdirectory(${CMAKE_BINARY_DIR}/googletest-src + ${CMAKE_BINARY_DIR}/googletest-build + EXCLUDE_FROM_ALL) + +##### END GOOGLE TEST INSTALLATION ##### + +include_directories(${CMAKE_SOURCE_DIR}/src) + +# Set the directory targets when build in libs and binaries +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +set (FAST_LINK_LIBS + faster +) + +# Set the link libraries to for test compilation +set (FAST_TEST_LINK_LIBS ${FAST_LINK_LIBS} gtest) +if(WIN32) +else() + set (FAST_TEST_LINK_LIBS ${FAST_TEST_LINK_LIBS} stdc++fs uuid tbb gcc aio m stdc++ pthread) +endif() + +# Set the link libraries to for benchmark binary compilation +set (FAST_BENCHMARK_LINK_LIBS ${FAST_LINK_LIBS}) +if(WIN32) +set (FAST_BENCHMARK_LINK_LIBS ${FAST_LINK_LIBS} wsock32 Ws2_32) +else() + set (FAST_BENCHMARK_LINK_LIBS ${FAST_BENCHMARK_LINK_LIBS} stdc++fs uuid tbb gcc aio m stdc++ pthread) +endif() + +#Function to automate building test binaries +FUNCTION(ADD_FAST_TEST TEST_NAME HEADERS) + add_executable(${TEST_NAME} ${HEADERS} ${TEST_NAME}.cc) + + target_link_libraries(${TEST_NAME} ${FAST_TEST_LINK_LIBS}) + add_test(${TEST_NAME} ${CMAKE_BINARY_DIR}/${TEST_NAME}) +ENDFUNCTION() + +#Function to automate building benchmark binaries +FUNCTION(ADD_FAST_BENCHMARK BENCHMARK_NAME) + add_executable(${BENCHMARK_NAME} ${BENCHMARK_HEADERS} ${BENCHMARK_NAME}.cc) + + target_link_libraries(${BENCHMARK_NAME} ${FAST_BENCHMARK_LINK_LIBS}) +ENDFUNCTION() + +# Build each subdirectory +add_subdirectory(benchmark-dir) +add_subdirectory(playground) +add_subdirectory(src) +add_subdirectory(test) + diff --git a/cc/CMakeLists.txt.in b/cc/CMakeLists.txt.in new file mode 100644 index 000000000..30a6ab711 --- /dev/null +++ b/cc/CMakeLists.txt.in @@ -0,0 +1,17 @@ +# Copied from https://github.com/google/googletest/tree/master/googletest#incorporating-into-an-existing-cmake-project + +cmake_minimum_required(VERSION 2.8.2) + +project(googletest-download NONE) + +include(ExternalProject) +ExternalProject_Add(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG master + SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-src" + BINARY_DIR "${CMAKE_BINARY_DIR}/googletest-build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) diff --git a/cc/README.md b/cc/README.md new file mode 100644 index 000000000..7e5a57579 --- /dev/null +++ b/cc/README.md @@ -0,0 +1,68 @@ +Building C++ FASTER +=================== +The C++ version of FASTER uses CMake for builds. To build C++ FASTER, create +one or more build directories and use CMake to set up build scripts for your +target OS. Once CMake has generated the build scripts, it will try to update +them, as needed, during ordinary build. 
+
+Building on Windows
+-------------------
+Create new directory "build" off the root directory (FAST\cc). From the new
+"build" directory, execute:
+
+    cmake .. -G "<MSVC compiler> Win64"
+
+To see a list of supported MSVC compiler versions, just run "cmake -G". As of
+this writing, we're using Visual Studio 2017, so you would execute:
+
+    cmake .. -G "Visual Studio 15 2017 Win64"
+
+That will create build scripts inside your new "build" directory, including
+a "FASTER.sln" file that you can use inside Visual Studio. CMake will add several
+build profiles to FASTER.sln, including Debug/x64 and Release/x64.
+
+Building on Linux
+-----------------
+The Linux build requires several packages (both libraries and header files);
+see "CMakeLists.txt" in the root directory (FAST/cc) for the list of libraries
+being linked to, on Linux.
+
+As of this writing, the required libraries are:
+  - stdc++fs : for <experimental/filesystem>, used for cross-platform directory
+    creation.
+  - uuid : support for GUIDs.
+  - tbb : Intel's Thread Building Blocks library, used for concurrent_queue.
+  - gcc
+  - aio : Kernel Async I/O, used by QueueFile / QueueIoHandler.
+  - stdc++
+  - pthread : thread library.
+
+Also, CMake on Linux, for the gcc compiler, generates build scripts for either
+Debug or Release build, but not both; so you'll have to run CMake twice, in two
+different directories, to get both Debug and Release build scripts.
+
+Create new directories "build/Debug" and "build/Release" off the root directory
+(FAST/cc). From "build/Debug", run:
+
+    cmake -DCMAKE_BUILD_TYPE=Debug ../..
+
+--and from "build/Release", run:
+
+    cmake -DCMAKE_BUILD_TYPE=Release ../..
+
+Then you can build Debug or Release binaries by running "make" inside the
+relevant build directory.
+
+Other options
+-------------
+You can try other generators (compilers) supported by CMake. The main CMake
+build script is the CMakeLists.txt located in the root directory (FAST/cc).
+
+Examples
+========
+There are some unit tests in FAST/cc/test.
+
+Sum-store, located in FAST/cc/playground/sum_store-dir, is a good example of
+checkpointing and recovery.
+
+There's a basic YCSB test driver in FAST/cc/benchmark-dir.
diff --git a/cc/benchmark-dir/CMakeLists.txt b/cc/benchmark-dir/CMakeLists.txt
new file mode 100644
index 000000000..1629d18ba
--- /dev/null
+++ b/cc/benchmark-dir/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(BENCHMARK_HEADERS
+  file.h
+)
+
+set(BENCHMARK_SOURCES
+)
+
+ADD_FAST_BENCHMARK(benchmark)
+
+add_executable(process_ycsb process_ycsb.cc)
diff --git a/cc/benchmark-dir/README.md b/cc/benchmark-dir/README.md
new file mode 100644
index 000000000..3fb5f8e4d
--- /dev/null
+++ b/cc/benchmark-dir/README.md
@@ -0,0 +1,17 @@
+Setting up YCSB
+===============
+First, download and install YCSB, from
+https://github.com/brianfrankcooper/YCSB/ . Configure YCSB for your intended
+workload, and run the "basic" driver (both "load" and "run," as required),
+redirecting the output to a file.
+
+The output of YCSB's "basic" driver is verbose. A typical line looks like:
+
+    INSERT usertable user5575651532496486335 [ field1='...' ... ]
+
+To speed up file ingestion, our basic YCSB benchmark assumes that the input
+file consists only of the 8-byte-integer portion of the key--e.g.:
+
+    5575651532496486335
+
+To convert YCSB "basic" output to the format we expect, run "process_ycsb."
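+
+For example, assuming you run from the CMake build directory and using
+placeholder file names (process_ycsb takes the source file and the destination
+file as its two arguments):
+
+    ./process_ycsb ycsb.load.txt load_keys.dat
+    ./process_ycsb ycsb.run.txt run_txns.dat
+
+The two output files can then be passed to the benchmark driver as its load
+and run inputs.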
diff --git a/cc/benchmark-dir/benchmark.cc b/cc/benchmark-dir/benchmark.cc new file mode 100644 index 000000000..29090237c --- /dev/null +++ b/cc/benchmark-dir/benchmark.cc @@ -0,0 +1,628 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include +#include +#include +#include +#include +#include +#include + +#include "file.h" + +#include "core/auto_ptr.h" +#include "core/faster.h" +#include "device/null_disk.h" + +using namespace std::chrono_literals; +using namespace FASTER::core; + +/// Basic YCSB benchmark. + +enum class Op : uint8_t { + Insert = 0, + Read = 1, + Upsert = 2, + Scan = 3, + ReadModifyWrite = 4, +}; + +enum class Workload { + A_50_50 = 0, + RMW_100 = 1, +}; + +static constexpr uint64_t kInitCount = 250000000; +static constexpr uint64_t kTxnCount = 1000000000; +static constexpr uint64_t kChunkSize = 3200; +static constexpr uint64_t kRefreshInterval = 64; +static constexpr uint64_t kCompletePendingInterval = 1600; + +static_assert(kInitCount % kChunkSize == 0, "kInitCount % kChunkSize != 0"); +static_assert(kTxnCount % kChunkSize == 0, "kTxnCount % kChunkSize != 0"); +static_assert(kCompletePendingInterval % kRefreshInterval == 0, + "kCompletePendingInterval % kRefreshInterval != 0"); + +static constexpr uint64_t kNanosPerSecond = 1000000000; + +static constexpr uint64_t kMaxKey = 268435456; +static constexpr uint64_t kRunSeconds = 360; +static constexpr uint64_t kCheckpointSeconds = 30; + +aligned_unique_ptr_t init_keys_; +aligned_unique_ptr_t txn_keys_; +std::atomic idx_{ 0 }; +std::atomic done_{ false }; +std::atomic total_duration_{ 0 }; +std::atomic total_reads_done_{ 0 }; +std::atomic total_writes_done_{ 0 }; + +class ReadContext; +class UpsertContext; +class RmwContext; + +/// This benchmark stores 8-byte keys in key-value store. +class Key { + public: + Key(uint64_t key) + : key_{ key } { + } + + /// Methods and operators required by the (implicit) interface: + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + return KeyHash{ Utility::GetHashCode(key_) }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint64_t key_; +}; + +/// This benchmark stores an 8-byte value in the key-value store. +class Value { + public: + Value() + : value_{ 0 } { + } + + Value(const Value& other) + : value_{ other.value_ } { + } + + Value(uint64_t value) + : value_{ value } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class ReadContext; + friend class UpsertContext; + friend class RmwContext; + + private: + union { + uint64_t value_; + std::atomic atomic_value_; + }; +}; + +/// Class passed to store_t::Read(). +class ReadContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext(uint64_t key) + : key_{ key } { + } + + /// Copy (and deep-copy) constructor. + ReadContext(const ReadContext& other) + : key_{ other.key_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + // For this benchmark, we don't copy out, so these are no-ops. + inline void Get(const value_t& value) { } + inline void GetAtomic(const value_t& value) { } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. 
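+  /// (FASTER invokes DeepCopy_Internal() when an operation cannot complete
+  /// synchronously and goes async; the context is copied to the heap so it
+  /// remains valid after the calling frame returns. See core/async.h.)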
+ Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; +}; + +/// Class passed to store_t::Upsert(). +class UpsertContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + UpsertContext(uint64_t key, uint64_t input) + : key_{ key } + , input_{ input } { + } + + /// Copy (and deep-copy) constructor. + UpsertContext(const UpsertContext& other) + : key_{ other.key_ } + , input_{ other.input_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + + /// Non-atomic and atomic Put() methods. + inline void Put(value_t& value) { + value.value_ = input_; + } + inline bool PutAtomic(value_t& value) { + value.atomic_value_.store(input_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint64_t input_; +}; + +/// Class passed to store_t::RMW(). +class RmwContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + RmwContext(uint64_t key, uint64_t incr) + : key_{ key } + , incr_{ incr } { + } + + /// Copy (and deep-copy) constructor. + RmwContext(const RmwContext& other) + : key_{ other.key_ } + , incr_{ other.incr_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + + /// Initial, non-atomic, and atomic RMW methods. + inline void RmwInitial(value_t& value) { + value.value_ = incr_; + } + inline void RmwCopy(const value_t& old_value, value_t& value) { + value.value_ = old_value.value_ + incr_; + } + inline bool RmwAtomic(value_t& value) { + value.atomic_value_.fetch_add(incr_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint64_t incr_; +}; + +/// Key-value store, specialized to our key and value types. +#ifdef _WIN32 +typedef FASTER::environment::ThreadPoolIoHandler handler_t; +#else +typedef FASTER::environment::QueueIoHandler handler_t; +#endif +typedef FASTER::device::FileSystemDisk disk_t; +using store_t = FasterKv; + +inline Op ycsb_a_50_50(std::mt19937& rng) { + if(rng() % 100 < 50) { + return Op::Read; + } else { + return Op::Upsert; + } +} + +inline Op ycsb_rmw_100(std::mt19937& rng) { + return Op::ReadModifyWrite; +} + +/// Affinitize to hardware threads on the same core first, before +/// moving on to the next core. +void SetThreadAffinity(size_t core) { + + // For now, assume 36 cores. (Set this correctly for your test system.) + constexpr size_t kCoreCount = 36; +#ifdef _WIN32 + HANDLE thread_handle = ::GetCurrentThread(); + GROUP_AFFINITY group; + group.Group = WORD(core / kCoreCount); + group.Mask = KAFFINITY(0x1llu << (core - kCoreCount * group.Group)); + ::SetThreadGroupAffinity(thread_handle, &group, nullptr); +#else + // On our 28-core test system, we see CPU 0, Core 0 assigned to 0, 28; + // CPU 1, Core 0 assigned to 1, 29; etc. 
+ cpu_set_t mask; + CPU_ZERO(&mask); +#ifdef NUMA + switch(core % 4) { + case 0: + // 0 |-> 0 + // 4 |-> 2 + // 8 |-> 4 + core = core / 2; + break; + case 1: + // 1 |-> 28 + // 5 |-> 30 + // 9 |-> 32 + core = kCoreCount + (core - 1) / 2; + break; + case 2: + // 2 |-> 1 + // 6 |-> 3 + // 10 |-> 5 + core = core / 2; + break; + case 3: + // 3 |-> 29 + // 7 |-> 31 + // 11 |-> 33 + core = kCoreCount + (core - 1) / 2; + break; + } +#else + switch(core % 2) { + case 0: + // 0 |-> 0 + // 2 |-> 2 + // 4 |-> 4 + core = core; + break; + case 1: + // 1 |-> 28 + // 3 |-> 30 + // 5 |-> 32 + core = (core - 1) + kCoreCount; + break; + } +#endif + CPU_SET(core, &mask); + + ::sched_setaffinity(0, sizeof(mask), &mask); +#endif +} + +void load_files(const std::string& load_filename, const std::string& run_filename) { + constexpr size_t kFileChunkSize = 131072; + + auto chunk_guard = alloc_aligned(512, kFileChunkSize); + uint64_t* chunk = chunk_guard.get(); + + FASTER::benchmark::File init_file{ load_filename }; + + printf("loading keys from %s into memory...\n", load_filename.c_str()); + + init_keys_ = alloc_aligned(64, kInitCount * sizeof(uint64_t)); + uint64_t count = 0; + + uint64_t offset = 0; + while(true) { + uint64_t size = init_file.Read(chunk, kFileChunkSize, offset); + for(uint64_t idx = 0; idx < size / 8; ++idx) { + init_keys_.get()[count] = chunk[idx]; + ++count; + } + if(size == kFileChunkSize) { + offset += kFileChunkSize; + } else { + break; + } + } + if(kInitCount != count) { + printf("Init file load fail!\n"); + exit(1); + } + + printf("loaded %" PRIu64 " keys.\n", count); + + FASTER::benchmark::File txn_file{ run_filename }; + + printf("loading txns from %s into memory...\n", run_filename.c_str()); + + txn_keys_ = alloc_aligned(64, kTxnCount * sizeof(uint64_t)); + + count = 0; + offset = 0; + + while(true) { + uint64_t size = txn_file.Read(chunk, kFileChunkSize, offset); + for(uint64_t idx = 0; idx < size / 8; ++idx) { + txn_keys_.get()[count] = chunk[idx]; + ++count; + } + if(size == kFileChunkSize) { + offset += kFileChunkSize; + } else { + break; + } + } + if(kTxnCount != count) { + printf("Txn file load fail!\n"); + exit(1); + } + printf("loaded %" PRIu64 " txns.\n", count); +} + +void thread_setup_store(store_t* store, size_t thread_idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + assert(result == Status::Ok); + }; + + SetThreadAffinity(thread_idx); + + Guid guid = store->StartSession(); + + uint64_t value = 42; + for(uint64_t chunk_idx = idx_.fetch_add(kChunkSize); chunk_idx < kInitCount; + chunk_idx = idx_.fetch_add(kChunkSize)) { + for(uint64_t idx = chunk_idx; idx < chunk_idx + kChunkSize; ++idx) { + if(idx % kRefreshInterval == 0) { + store->Refresh(); + if(idx % kCompletePendingInterval == 0) { + store->CompletePending(false); + } + } + + UpsertContext context{ init_keys_.get()[idx], value }; + store->Upsert(context, callback, 1); + } + } + + store->CompletePending(true); + store->StopSession(); +} + +void setup_store(store_t* store, size_t num_threads) { + idx_ = 0; + std::deque threads; + for(size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) { + threads.emplace_back(&thread_setup_store, store, thread_idx); + } + for(auto& thread : threads) { + thread.join(); + } + + init_keys_.reset(); + + printf("Finished populating store: contains ?? 
elements.\n"); +} + + +static std::atomic async_reads_done{ 0 }; +static std::atomic async_writes_done{ 0 }; + +template +void thread_run_benchmark(store_t* store, size_t thread_idx) { + SetThreadAffinity(thread_idx); + + std::random_device rd{}; + std::mt19937 rng{ rd() }; + + auto start_time = std::chrono::high_resolution_clock::now(); + + uint64_t upsert_value = 0; + int64_t reads_done = 0; + int64_t writes_done = 0; + + Guid guid = store->StartSession(); + + while(!done_) { + uint64_t chunk_idx = idx_.fetch_add(kChunkSize); + while(chunk_idx >= kTxnCount) { + if(chunk_idx == kTxnCount) { + idx_ = 0; + } + chunk_idx = idx_.fetch_add(kChunkSize); + } + for(uint64_t idx = chunk_idx; idx < chunk_idx + kChunkSize; ++idx) { + if(idx % kRefreshInterval == 0) { + store->Refresh(); + if(idx % kCompletePendingInterval == 0) { + store->CompletePending(false); + } + } + switch(FN(rng)) { + case Op::Insert: + case Op::Upsert: { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + }; + + UpsertContext context{ txn_keys_.get()[idx], upsert_value }; + Status result = store->Upsert(context, callback, 1); + ++writes_done; + break; + } + case Op::Scan: + printf("Scan currently not supported!\n"); + exit(1); + break; + case Op::Read: { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + }; + + ReadContext context{ txn_keys_.get()[idx] }; + + Status result = store->Read(context, callback, 1); + ++reads_done; + break; + } + case Op::ReadModifyWrite: + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + }; + + RmwContext context{ txn_keys_.get()[idx], 5 }; + Status result = store->Rmw(context, callback, 1); + if(result == Status::Ok) { + ++writes_done; + } + break; + } + } + } + + store->CompletePending(true); + store->StopSession(); + + auto end_time = std::chrono::high_resolution_clock::now(); + std::chrono::nanoseconds duration = end_time - start_time; + total_duration_ += duration.count(); + total_reads_done_ += reads_done; + total_writes_done_ += writes_done; + printf("Finished thread %" PRIu64 " : %" PRIu64 " reads, %" PRIu64 " writes, in %.2f seconds.\n", + thread_idx, reads_done, writes_done, (double)duration.count() / kNanosPerSecond); +} + +template +void run_benchmark(store_t* store, size_t num_threads) { + idx_ = 0; + total_duration_ = 0; + total_reads_done_ = 0; + total_writes_done_ = 0; + done_ = false; + std::deque threads; + for(size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) { + threads.emplace_back(&thread_run_benchmark, store, thread_idx); + } + + static std::atomic num_checkpoints; + num_checkpoints = 0; + + if(kCheckpointSeconds == 0) { + std::this_thread::sleep_for(std::chrono::seconds(kRunSeconds)); + } else { + auto callback = [](uint64_t persistent_serial_num) { + ++num_checkpoints; + }; + + auto start_time = std::chrono::high_resolution_clock::now(); + auto last_checkpoint_time = start_time; + auto current_time = start_time; + + uint64_t checkpoint_num = 0; + + while(current_time - start_time < std::chrono::seconds(kRunSeconds)) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + current_time = std::chrono::high_resolution_clock::now(); + if(current_time - last_checkpoint_time >= std::chrono::seconds(kCheckpointSeconds)) { + bool success = store->Checkpoint(callback); + if(success) { + printf("Starting checkpoint %" PRIu64 ".\n", checkpoint_num); + ++checkpoint_num; + } else { + printf("Failed to start checkpoint.\n"); + } + 
last_checkpoint_time = current_time; + } + } + + done_ = true; + } + + for(auto& thread : threads) { + thread.join(); + } + + printf("Finished benchmark: %" PRIu64 " thread checkpoints completed; %.2f ops/second/thread\n", + num_checkpoints.load(), + ((double)total_reads_done_ + (double)total_writes_done_) / ((double)total_duration_ / + kNanosPerSecond)); +} + +void run(Workload workload, size_t num_threads) { + // FASTER store has a hash table with approx. kInitCount / 2 entries, a log of size 16 GB, + // and a null device (it's in-memory only). + size_t init_size = next_power_of_two(kInitCount / 2); + store_t store{ init_size, 17179869184, "storage" }; + + printf("Populating the store...\n"); + + setup_store(&store, num_threads); + + store.DumpDistribution(); + + printf("Running benchmark on %" PRIu64 " threads...\n", num_threads); + switch(workload) { + case Workload::A_50_50: + run_benchmark(&store, num_threads); + break; + case Workload::RMW_100: + run_benchmark(&store, num_threads); + break; + default: + printf("Unknown workload!\n"); + exit(1); + } +} + +int main(int argc, char* argv[]) { + constexpr size_t kNumArgs = 4; + if(argc != kNumArgs + 1) { + printf("Usage: benchmark.exe <# threads> \n"); + exit(0); + } + + Workload workload = static_cast(std::atol(argv[1])); + size_t num_threads = ::atol(argv[2]); + std::string load_filename{ argv[3] }; + std::string run_filename{ argv[4] }; + + load_files(load_filename, run_filename); + + run(workload, num_threads); + + return 0; +} diff --git a/cc/benchmark-dir/benchmark.vcxproj b/cc/benchmark-dir/benchmark.vcxproj new file mode 100644 index 000000000..36b05c245 --- /dev/null +++ b/cc/benchmark-dir/benchmark.vcxproj @@ -0,0 +1,170 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {109C58E8-A9A2-49CC-86F4-64D25FB40773} + Win32Proj + benchmark + 10.0.16299.0 + + + + Application + true + v141 + Unicode + + + Application + false + v141 + true + Unicode + + + Application + true + v141 + Unicode + + + Application + false + v141 + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + NotUsing + Level3 + Disabled + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + 4996 + true + + + Console + true + + + + + NotUsing + Level3 + Disabled + true + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + 4996 + true + + + Console + true + + + + + NotUsing + Level3 + MaxSpeed + false + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + 4996 + + + Console + true + true + true + + + + + NotUsing + Level3 + MaxSpeed + false + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + 4996 + + + Console + true + true + true + + + + + + + + {419e0f92-c483-416e-ada3-292a1c6cce7c} + + + + + + + + + + \ No newline at end of file diff --git a/cc/benchmark-dir/benchmark.vcxproj.filters b/cc/benchmark-dir/benchmark.vcxproj.filters new file mode 100644 index 000000000..11c6a1e87 --- /dev/null +++ b/cc/benchmark-dir/benchmark.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Header Files + + + Header Files + + + + + Source Files + + + \ No newline at end of file diff --git a/cc/benchmark-dir/file.h b/cc/benchmark-dir/file.h new file 
mode 100644 index 000000000..b50e7e485 --- /dev/null +++ b/cc/benchmark-dir/file.h @@ -0,0 +1,61 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include + +#ifdef _WIN32 +#define NOMINMAX +#define _WINSOCKAPI_ +#include +#else +#include +#include +#include +#include +#endif + +namespace FASTER { +namespace benchmark { + +/// Basic wrapper around synchronous file read. +class File { + public: + File(const std::string& filename) { +#ifdef _WIN32 + file_handle_ = ::CreateFileA(filename.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, + OPEN_EXISTING, FILE_FLAG_NO_BUFFERING, nullptr); +#else + fd_ = ::open(filename.c_str(), O_RDONLY | O_DIRECT, S_IRUSR); +#endif + } + + ~File() { +#ifdef _WIN32 + ::CloseHandle(file_handle_); +#else + ::close(fd_); +#endif + } + + size_t Read(void* buf, size_t count, uint64_t offset) { +#ifdef _WIN32 + DWORD bytes_read { 0 }; + ::ReadFile(file_handle_, buf, static_cast(count), &bytes_read, nullptr); + return bytes_read; +#else + return ::pread(fd_, buf, count, offset); +#endif + } + + private: +#ifdef _WIN32 + HANDLE file_handle_; +#else + int fd_; +#endif +}; + +} +} // namespace FASTER::benchmark diff --git a/cc/benchmark-dir/process_ycsb.cc b/cc/benchmark-dir/process_ycsb.cc new file mode 100644 index 000000000..a972a6b2c --- /dev/null +++ b/cc/benchmark-dir/process_ycsb.cc @@ -0,0 +1,38 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + +int main(int argc, char* argv[]) { + if (argc != 3) { + fprintf(stderr, "Requires two arguments: file copied from, file copied to.\n"); + exit(-1); + } + + std::string from_filename{ argv[1] }; + std::string to_filename{ argv[2] }; + + std::ifstream from_file{ from_filename }; + std::ofstream to_file{ to_filename }; + + const std::string prefix{ "usertable user" }; + + while (!from_file.eof()) { + char buffer[256]; + from_file.getline(buffer, sizeof(buffer)); + std::string line{ buffer }; + std::string::size_type pos = line.find(prefix); + if (pos == std::string::npos) { + continue; + } + line = line.substr(pos + prefix.size()); + uint64_t key = stol(line); + + to_file.write(reinterpret_cast(&key), sizeof(key)); + } +} diff --git a/cc/playground/CMakeLists.txt b/cc/playground/CMakeLists.txt new file mode 100644 index 000000000..89fef048e --- /dev/null +++ b/cc/playground/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(sum_store-dir) diff --git a/cc/playground/sum_store-dir/CMakeLists.txt b/cc/playground/sum_store-dir/CMakeLists.txt new file mode 100644 index 000000000..f2e536431 --- /dev/null +++ b/cc/playground/sum_store-dir/CMakeLists.txt @@ -0,0 +1,9 @@ +set(SUM_STORE_HEADERS + concurrent_recovery_test.h + single_threaded_recovery_test.h + sum_store.h +) + +add_executable(sum_store ${SUM_STORE_HEADERS} sum_store.cc) +target_link_libraries(sum_store ${FAST_BENCHMARK_LINK_LIBS}) + diff --git a/cc/playground/sum_store-dir/concurrent_recovery_test.h b/cc/playground/sum_store-dir/concurrent_recovery_test.h new file mode 100644 index 000000000..63007de88 --- /dev/null +++ b/cc/playground/sum_store-dir/concurrent_recovery_test.h @@ -0,0 +1,276 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
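+
+// This header exercises FASTER's checkpoint/recovery path under concurrency:
+// PopulateWorker() threads issue RMW increments over a shared key space and
+// periodically trigger Checkpoint(); RecoverAndTest() later recovers a
+// checkpoint, resumes the saved sessions to obtain their persisted serial
+// numbers, reads back every key, and compares the sums against the counts
+// those serial numbers imply.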
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core/auto_ptr.h" +#include "core/faster.h" +#include "core/thread.h" +#include "sum_store.h" + +namespace sum_store { + +class ConcurrentRecoveryTest { + public: + static constexpr uint64_t kNumUniqueKeys = (1L << 22); + static constexpr uint64_t kKeySpace = (1L << 14); + static constexpr uint64_t kNumOps = (1L << 25); + static constexpr uint64_t kRefreshInterval = (1L << 8); + static constexpr uint64_t kCompletePendingInterval = (1L << 12); + static constexpr uint64_t kCheckpointInterval = (1L << 22); + + ConcurrentRecoveryTest(store_t& store_, size_t num_threads_) + : store{ store_ } + , num_threads{ num_threads_ } + , num_active_threads{ 0 } + , num_checkpoints{ 0 } { + } + + private: + static void PopulateWorker(store_t* store, size_t thread_idx, + std::atomic* num_active_threads, size_t num_threads, + std::atomic* num_checkpoints) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + assert(result == Status::Ok); + }; + + auto persistence_callback = [](uint64_t persistent_serial_num) { + printf("Thread %" PRIu32 " reports persistence until %" PRIu64 "\n", + Thread::id(), persistent_serial_num); + }; + + // Register thread with the store + store->StartSession(); + + ++(*num_active_threads); + + // Process the batch of input data + for(size_t idx = 0; idx < kNumOps; ++idx) { + RmwContext context{ idx % kNumUniqueKeys, 1 }; + store->Rmw(context, callback, idx); + if(idx % kCheckpointInterval == 0 && *num_active_threads == num_threads) { + if(store->Checkpoint(persistence_callback)) { + printf("Thread %" PRIu32 " calling Checkpoint(), %" PRIu32 "\n", Thread::id(), + ++(*num_checkpoints)); + } + } + if(idx % kCompletePendingInterval == 0) { + store->CompletePending(false); + } else if(idx % kRefreshInterval == 0) { + store->Refresh(); + } + } + + // Make sure operations are completed + store->CompletePending(true); + + // Deregister thread from FASTER + store->StopSession(); + + printf("Populate successful on thread %" PRIu32 ".\n", Thread::id()); + } + + public: + void Populate() { + std::deque threads; + for(size_t idx = 0; idx < num_threads; ++idx) { + threads.emplace_back(&PopulateWorker, &store, idx, &num_active_threads, num_threads, + &num_checkpoints); + } + for(auto& thread : threads) { + thread.join(); + } + // Verify the records. 
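  // Every worker issued kNumOps read-modify-writes that cycle round-robin over
  // kNumUniqueKeys keys, so a fully completed run should leave each AdId with a
  // count of (num_threads * kNumOps) / kNumUniqueKeys; the reads below check that.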
+ auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + assert(result == Status::Ok); + }; + // Create array for reading + auto read_results = alloc_aligned(64, sizeof(uint64_t) * kNumUniqueKeys); + std::memset(read_results.get(), 0, sizeof(uint64_t) * kNumUniqueKeys); + + // Register with thread + store.StartSession(); + + // Issue read requests + for(uint64_t idx = 0; idx < kNumUniqueKeys; ++idx) { + ReadContext context{ AdId{ idx }, read_results.get() + idx }; + store.Read(context, callback, idx); + } + + // Complete all pending requests + store.CompletePending(true); + + // Release + store.StopSession(); + for(uint64_t idx = 0; idx < kNumUniqueKeys; ++idx) { + uint64_t expected_result = (num_threads * kNumOps) / kNumUniqueKeys; + if(read_results.get()[idx] != expected_result) { + printf("Debug error for AdId %" PRIu64 ": Expected (%" PRIu64 "), Found(%" PRIu64 ")\n", + idx, + expected_result, + read_results.get()[idx]); + } + } + } + + void RecoverAndTest(uint32_t cpr_version, uint32_t index_version) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + assert(result == Status::Ok); + }; + + // Recover + std::vector session_ids; + FASTER::core::Status result = store.Recover(cpr_version, index_version, session_ids); + if(result != FASTER::core::Status::Ok) { + printf("Recovery failed with error %u\n", static_cast(result)); + exit(1); + } + + std::vector serial_nums; + for(const auto& session_id : session_ids) { + serial_nums.push_back(store.ContinueSession(session_id)); + store.StopSession(); + } + + // Create array for reading + auto read_results = alloc_aligned(64, sizeof(uint64_t) * kNumUniqueKeys); + std::memset(read_results.get(), 0, sizeof(uint64_t) * kNumUniqueKeys); + + // Register with thread + store.StartSession(); + + // Issue read requests + for(uint64_t idx = 0; idx < kNumUniqueKeys; ++idx) { + ReadContext context{ AdId{ idx}, read_results.get() + idx }; + store.Read(context, callback, idx); + } + + // Complete all pending requests + store.CompletePending(true); + + // Release + store.StopSession(); + + // Test outputs + // Compute expected array + auto expected_results = alloc_aligned(64, + sizeof(uint64_t) * kNumUniqueKeys); + std::memset(expected_results.get(), 0, sizeof(uint64_t) * kNumUniqueKeys); + + // Sessions that were active during checkpoint: + for(uint64_t serial_num : serial_nums) { + for(uint64_t idx = 0; idx <= serial_num; ++idx) { + ++expected_results.get()[idx % kNumUniqueKeys]; + } + } + // Sessions that were finished at time of checkpoint. 
+ size_t num_completed = num_threads - serial_nums.size(); + for(size_t thread_idx = 0; thread_idx < num_completed; ++thread_idx) { + uint64_t serial_num = kNumOps; + for(uint64_t idx = 0; idx < serial_num; ++idx) { + ++expected_results.get()[idx % kNumUniqueKeys]; + } + } + + // Assert if expected is same as found + for(uint64_t idx = 0; idx < kNumUniqueKeys; ++idx) { + if(expected_results.get()[idx] != read_results.get()[idx]) { + printf("Debug error for AdId %" PRIu64 ": Expected (%" PRIu64 "), Found(%" PRIu64 ")\n", + idx, + expected_results.get()[idx], + read_results.get()[idx]); + } + } + printf("Test successful\n"); + } + + static void ContinueWorker(store_t* store, size_t thread_idx, + std::atomic* num_active_threads, size_t num_threads, + std::atomic* num_checkpoints, Guid guid) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + assert(result == Status::Ok); + }; + + auto persistence_callback = [](uint64_t persistent_serial_num) { + printf("Thread %" PRIu32 " reports persistence until %" PRIu64 "\n", + Thread::id(), persistent_serial_num); + }; + + // Register thread with the store + uint64_t start_num = store->ContinueSession(guid); + + ++(*num_active_threads); + + // Process the batch of input data + for(size_t idx = start_num + 1; idx < kNumOps; ++idx) { + RmwContext context{ idx % kNumUniqueKeys, 1 }; + store->Rmw(context, callback, idx); + if(idx % kCheckpointInterval == 0 && *num_active_threads == num_threads) { + if(store->Checkpoint(persistence_callback)) { + printf("Thread %" PRIu32 " calling Checkpoint(), %" PRIu32 "\n", Thread::id(), + ++(*num_checkpoints)); + } + } + if(idx % kCompletePendingInterval == 0) { + store->CompletePending(false); + } else if(idx % kRefreshInterval == 0) { + store->Refresh(); + } + } + + // Make sure operations are completed + store->CompletePending(true); + + // Deregister thread from FASTER + store->StopSession(); + + printf("Populate successful on thread %" PRIu32 ".\n", Thread::id()); + } + + void Continue(uint32_t cpr_version, uint32_t index_version) { + // Recover + printf("Recovering version (%" PRIu32 ", %" PRIu32 ")\n", cpr_version, index_version); + std::vector session_ids; + FASTER::core::Status result = store.Recover(cpr_version, index_version, session_ids); + if(result != FASTER::core::Status::Ok) { + printf("Recovery failed with error %u\n", static_cast(result)); + exit(1); + } else { + printf("Recovery Done!\n"); + } + + num_checkpoints.store(cpr_version); + // Some threads may have already completed. + num_threads = session_ids.size(); + + std::deque threads; + for(size_t idx = 0; idx < num_threads; ++idx) { + threads.emplace_back(&ContinueWorker, &store, idx, &num_active_threads, num_threads, + &num_checkpoints, session_ids[idx]); + } + for(auto& thread : threads) { + thread.join(); + } + } + + store_t& store; + size_t num_threads; + std::atomic num_active_threads; + std::atomic num_checkpoints; +}; + +} // namespace sum_store diff --git a/cc/playground/sum_store-dir/single_threaded_recovery_test.h b/cc/playground/sum_store-dir/single_threaded_recovery_test.h new file mode 100644 index 000000000..cf9a642b9 --- /dev/null +++ b/cc/playground/sum_store-dir/single_threaded_recovery_test.h @@ -0,0 +1,140 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
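The arithmetic in RecoverAndTest() above captures the CPR recovery guarantee: a session that was still active at the checkpoint is recovered up to and including its persisted serial number, while a session that had already finished contributes all kNumOps operations. The same expected-count computation as a stand-alone sketch (parameter values would come from Recover()):

#include <cstdint>
#include <vector>

std::vector<uint64_t> ExpectedCounts(const std::vector<uint64_t>& serial_nums,
                                     size_t num_threads, uint64_t num_unique_keys,
                                     uint64_t num_ops) {
  std::vector<uint64_t> expected(num_unique_keys, 0);
  // Sessions still active at the checkpoint: operations 0..serial_num inclusive.
  for(uint64_t serial_num : serial_nums) {
    for(uint64_t idx = 0; idx <= serial_num; ++idx) {
      ++expected[idx % num_unique_keys];
    }
  }
  // Sessions that had already finished: all num_ops operations.
  for(size_t t = 0; t < num_threads - serial_nums.size(); ++t) {
    for(uint64_t idx = 0; idx < num_ops; ++idx) {
      ++expected[idx % num_unique_keys];
    }
  }
  return expected;
}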
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "core/auto_ptr.h" +#include "core/faster.h" +#include "sum_store.h" + +using namespace FASTER; + +namespace sum_store { + +class SingleThreadedRecoveryTest { + public: + static constexpr uint64_t kNumUniqueKeys = (1L << 23); + static constexpr uint64_t kNumOps = (1L << 25); + static constexpr uint64_t kRefreshInterval = (1L << 8); + static constexpr uint64_t kCompletePendingInterval = (1L << 12); + static constexpr uint64_t kCheckpointInterval = (1L << 20); + + SingleThreadedRecoveryTest(store_t& store_) + : store{ store_ } { + } + + private: + + public: + void Populate() { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + assert(result == Status::Ok); + }; + + auto persistence_callback = [](uint64_t persistent_serial_num) { + printf("Thread %" PRIu32 " reports persistence until %" PRIu64 "\n", + Thread::id(), persistent_serial_num); + }; + + // Register thread with FASTER + store.StartSession(); + + // Process the batch of input data + for(uint64_t idx = 0; idx < kNumOps; ++idx) { + RmwContext context{ AdId{ idx % kNumUniqueKeys}, 1 }; + store.Rmw(context, callback, idx); + + if(idx % kCheckpointInterval == 0) { + store.Checkpoint(persistence_callback); + } + if(idx % kCompletePendingInterval == 0) { + store.CompletePending(false); + } else if(idx % kRefreshInterval == 0) { + store.Refresh(); + } + } + // Make sure operations are completed + store.CompletePending(true); + + // Deregister thread from FASTER + store.StopSession(); + + printf("Populate successful\n"); + + std::string discard; + std::getline(std::cin, discard); + } + + void RecoverAndTest(uint32_t cpr_version, uint32_t index_version) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + assert(result == Status::Ok); + }; + + // Recover + std::vector session_ids; + store.Recover(cpr_version, index_version, session_ids); + + // Create array for reading + auto read_results = alloc_aligned(64, sizeof(uint64_t) * kNumUniqueKeys); + std::memset(read_results.get(), 0, sizeof(uint64_t) * kNumUniqueKeys); + + Guid session_id = session_ids[0]; + + // Register with thread + uint64_t sno = store.ContinueSession(session_id); + + // Issue read requests + for(uint64_t idx = 0; idx < kNumUniqueKeys; ++idx) { + ReadContext context{ AdId{ idx}, read_results.get() + idx }; + store.Read(context, callback, idx); + } + + // Complete all pending requests + store.CompletePending(true); + + // Release + store.StopSession(); + + // Test outputs + // Compute expected array + auto expected_results = alloc_aligned(64, + sizeof(uint64_t) * kNumUniqueKeys); + std::memset(expected_results.get(), 0, sizeof(uint64_t) * kNumUniqueKeys); + + for(uint64_t idx = 0; idx <= sno; ++idx) { + ++expected_results.get()[idx % kNumUniqueKeys]; + } + + // Assert if expected is same as found + for(uint64_t idx = 0; idx < kNumUniqueKeys; ++idx) { + if(expected_results.get()[idx] != read_results.get()[idx]) { + printf("Debug error for AdId %" PRIu64 ": Expected (%" PRIu64 "), Found(%" PRIu64 ")\n", + idx, + expected_results.get()[idx], + read_results.get()[idx]); + } + } + printf("Test successful\n"); + + std::string discard; + std::getline(std::cin, discard); + } + + void Continue() { + // Not implemented. 
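    // A single-threaded "continue" path would mirror ConcurrentRecoveryTest::ContinueWorker():
    // ContinueSession(guid) returns the persisted serial number, and execution resumes at
    // serial number + 1.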
+ assert(false); + } + + store_t& store; +}; + +} // namespace sum_store diff --git a/cc/playground/sum_store-dir/sum_store.cc b/cc/playground/sum_store-dir/sum_store.cc new file mode 100644 index 000000000..f42edfd90 --- /dev/null +++ b/cc/playground/sum_store-dir/sum_store.cc @@ -0,0 +1,77 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include +#include +#include +#include +#include + +#include "concurrent_recovery_test.h" +#include "sum_store.h" +#include "single_threaded_recovery_test.h" + +int main(int argc, char* argv[]) { + if(argc < 3) { + printf("Usage: sum_store.exe single \n"); + printf("Where is one of \"populate\", \"recover \", or \"continue\".\n"); + exit(0); + } + + std::experimental::filesystem::create_directory("sum_storage"); + + static constexpr uint64_t kKeySpace = (1L << 15); + + sum_store::store_t store{ kKeySpace, 17179869184, "sum_storage" }; + + + std::string type{ argv[1] }; + if(type == "single") { + sum_store::SingleThreadedRecoveryTest test{ store }; + + std::string task{ argv[2] }; + if(task == "populate") { + test.Populate(); + } else if(task == "recover") { + if(argc != 4) { + printf("Must specify version to recover to.\n"); + exit(1); + } + uint32_t version = std::atoi(argv[3]); + test.RecoverAndTest(version, version); + } + } else if(type == "concurrent") { + if(argc < 4) { + printf("Must specify number of threads to execute concurrently.\n"); + exit(1); + } + + size_t num_threads = std::atoi(argv[2]); + + sum_store::ConcurrentRecoveryTest test{ store, num_threads }; + + std::string task{ argv[3] }; + if(task == "populate") { + test.Populate(); + } else if(task == "recover") { + if(argc != 5) { + printf("Must specify version to recover to.\n"); + exit(1); + } + uint32_t version = std::atoi(argv[4]); + test.RecoverAndTest(version, version); + } else if(task == "continue") { + if(argc != 5) { + printf("Must specify version to continue from.\n"); + exit(1); + } + uint32_t version = std::atoi(argv[4]); + test.Continue(version, version); + } + + } + + + return 0; +} + diff --git a/cc/playground/sum_store-dir/sum_store.h b/cc/playground/sum_store-dir/sum_store.h new file mode 100644 index 000000000..38c4861b6 --- /dev/null +++ b/cc/playground/sum_store-dir/sum_store.h @@ -0,0 +1,160 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include "core/faster.h" +#include "core/utility.h" +#include "device/file_system_disk.h" + +using namespace FASTER::core; + +namespace sum_store { + +// Sum store's key type. +class AdId { + public: + AdId(uint64_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(AdId)); + } + inline KeyHash GetHash() const { + return KeyHash{ Utility::GetHashCode(key_) }; + } + + /// Comparison operators. + inline bool operator==(const AdId& other) const { + return key_ == other.key_; + } + inline bool operator!=(const AdId& other) const { + return key_ != other.key_; + } + + private: + uint64_t key_; +}; +static_assert(sizeof(AdId) == 8, "sizeof(AdId) != 8)"); + +// Sum store's value type. +class NumClicks { + public: + NumClicks() + : num_clicks{ 0 } { + } + NumClicks(const NumClicks& other) + : num_clicks{ other.num_clicks } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(NumClicks)); + } + + union { + uint64_t num_clicks; + std::atomic atomic_num_clicks; + }; +}; + +/// Key is an 8-byte advertising ID. 
+typedef AdId key_t; + +/// Value is an 8-byte count of clicks. +typedef NumClicks value_t; + +/// Context to update the sum store (via read-modify-write). +class RmwContext : public IAsyncContext { + public: + typedef sum_store::key_t key_t; + typedef sum_store::value_t value_t; + + RmwContext(const AdId& key, uint64_t increment) + : key_{ key } + , increment_{ increment } { + } + + /// Copy (and deep-copy) constructor. + RmwContext(const RmwContext& other) + : key_{ other.key_ } + , increment_{ other.increment_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const AdId& key() const { + return key_; + } + + inline void RmwInitial(NumClicks& value) { + value.num_clicks = increment_; + } + inline void RmwCopy(const NumClicks& old_value, NumClicks& value) { + value.num_clicks = old_value.num_clicks + increment_; + } + inline bool RmwAtomic(NumClicks& value) { + value.atomic_num_clicks.fetch_add(increment_); + return true; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + AdId key_; + uint64_t increment_; +}; + +/// Context to read the store (after recovery). +class ReadContext : public IAsyncContext { + public: + typedef sum_store::key_t key_t; + typedef sum_store::value_t value_t; + + ReadContext(const AdId& key, uint64_t* result) + : key_{ key } + , result_{ result } { + } + + /// Copy (and deep-copy) constructor. + ReadContext(const ReadContext& other) + : key_{ other.key_ } + , result_{ other.result_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const AdId& key() const { + return key_; + } + + inline void Get(const value_t& value) { + *result_ = value.num_clicks; + } + inline void GetAtomic(const value_t& value) { + *result_ = value.atomic_num_clicks; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + AdId key_; + uint64_t* result_; +}; + +typedef FasterKv> store_t; + +} // namespace sum_store diff --git a/cc/src/CMakeLists.txt b/cc/src/CMakeLists.txt new file mode 100644 index 000000000..4ea429cc2 --- /dev/null +++ b/cc/src/CMakeLists.txt @@ -0,0 +1,63 @@ +# Build the FASTER library. 
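RmwContext and ReadContext above are everything a caller of the sum store has to supply; the store itself is driven with the same session and serial-number discipline used by the recovery tests. A condensed, illustrative sketch (table size, log size, and storage directory are placeholders, and the directory must already exist):

#include <cassert>
#include <cstdint>
#include "sum_store.h"

void SumStoreExample() {
  // Table size must be a power of two; the log here is 16 GB.
  sum_store::store_t store{ 1 << 15, 17179869184, "sum_storage" };

  // Callbacks only run if an operation goes async (i.e., the record is not in memory).
  auto rmw_callback = [](IAsyncContext* ctxt, Status result) {
    CallbackContext<sum_store::RmwContext> context{ ctxt };
    assert(result == Status::Ok);
  };
  auto read_callback = [](IAsyncContext* ctxt, Status result) {
    CallbackContext<sum_store::ReadContext> context{ ctxt };
    assert(result == Status::Ok);
  };

  store.StartSession();  // register this thread with the store

  sum_store::RmwContext rmw{ sum_store::AdId{ 42 }, /* increment */ 5 };
  store.Rmw(rmw, rmw_callback, /* monotonic serial number */ 1);

  uint64_t num_clicks = 0;
  sum_store::ReadContext read{ sum_store::AdId{ 42 }, &num_clicks };
  store.Read(read, read_callback, 2);

  store.CompletePending(true);  // drain any pending operations
  store.StopSession();          // deregister this thread
}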
+set (FAST_HEADERS + core/address.h + core/alloc.h + core/async.h + core/async_result_types.h + core/auto_ptr.h + core/checkpoint_locks.h + core/checkpoint_state.h + core/constants.h + core/faster.h + core/gc_state.h + core/grow_state.h + core/guid.h + core/hash_bucket.h + core/hash_table.h + core/internal_contexts.h + core/key_hash.h + core/light_epoch.h + core/lss_allocator.h + core/malloc_fixed_page_size.h + core/native_buffer_pool.h + core/persistent_memory_malloc.h + core/phase.h + core/record.h + core/recovery_status.h + core/state_transitions.h + core/status.h + core/thread.h + core/utility.h + device/file_system_disk.h + device/null_disk.h + environment/file.h + environment/file_common.h +) + +if (MSVC) +set (FAST_HEADERS ${FAST_HEADERS} + environment/file_windows.h +) +else() +set (FAST_HEADERS ${FAST_HEADERS} + environment/file_linux.h +) +endif() + +set (FAST_SOURCES + core/address.cc + core/lss_allocator.cc + core/thread.cc +) + +if (MSVC) +set (FAST_SOURCES ${FAST_SOURCES} + environment/file_windows.cc +) +else() +set (FAST_SOURCES ${FAST_SOURCES} + environment/file_linux.cc +) +endif() + +add_library(faster STATIC ${FAST_SOURCES} ${FAST_HEADERS}) diff --git a/cc/src/core/address.cc b/cc/src/core/address.cc new file mode 100644 index 000000000..1d9e289af --- /dev/null +++ b/cc/src/core/address.cc @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include "address.h" + +namespace FASTER { +namespace core { + +constexpr uint32_t Address::kMaxOffset; + +} +} // namespace FASTER::core diff --git a/cc/src/core/address.h b/cc/src/core/address.h new file mode 100644 index 000000000..aeb52d3fb --- /dev/null +++ b/cc/src/core/address.h @@ -0,0 +1,177 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include + +namespace FASTER { +namespace core { + +class PageOffset; + +/// (Logical) address into persistent memory. Identifies a page and an offset within that page. +/// Uses 48 bits: 25 bits for the offset and 23 bits for the page. (The remaining 16 bits are +/// reserved for use by the hash table.) +/// Address +class Address { + public: + friend class PageOffset; + + /// An invalid address, used when you need to initialize an address but you don't have a valid + /// value for it yet. NOTE: set to 1, not 0, to distinguish an invalid hash bucket entry + /// (initialized to all zeros) from a valid hash bucket entry that points to an invalid address. + static constexpr uint64_t kInvalidAddress = 1; + + /// A logical address is 8 bytes. + /// --of which 48 bits are used for the address. (The remaining 16 bits are used by the hash + /// table, for control bits and the tag.) + static constexpr uint64_t kAddressBits = 48; + static constexpr uint64_t kMaxAddress = ((uint64_t)1 << kAddressBits) - 1; + /// --of which 25 bits are used for offsets into a page, of size 2^25 = 32 MB. + static constexpr uint64_t kOffsetBits = 25; + static constexpr uint32_t kMaxOffset = ((uint32_t)1 << kOffsetBits) - 1; + /// --and the remaining 23 bits are used for the page index, allowing for approximately 8 million + /// pages. + static constexpr uint64_t kPageBits = kAddressBits - kOffsetBits; + static constexpr uint32_t kMaxPage = ((uint32_t)1 << kPageBits) - 1; + + /// Default constructor. 
+ Address() + : control_{ 0 } { + } + Address(uint32_t page, uint32_t offset) + : reserved_{ 0 } + , page_{ page } + , offset_{ offset } { + } + /// Copy constructor. + Address(const Address& other) + : control_{ other.control_ } { + } + Address(uint64_t control) + : control_{ control } { + assert(reserved_ == 0); + } + + inline Address& operator=(const Address& other) { + control_ = other.control_; + return *this; + } + inline Address& operator+=(uint64_t delta) { + assert(delta < UINT32_MAX); + control_ += delta; + return *this; + } + inline Address operator-(const Address& other) { + return control_ - other.control_; + } + + /// Comparison operators. + inline bool operator<(const Address& other) const { + assert(reserved_ == 0); + assert(other.reserved_ == 0); + return control_ < other.control_; + } + inline bool operator<=(const Address& other) const { + assert(reserved_ == 0); + assert(other.reserved_ == 0); + return control_ <= other.control_; + } + inline bool operator>(const Address& other) const { + assert(reserved_ == 0); + assert(other.reserved_ == 0); + return control_ > other.control_; + } + inline bool operator>=(const Address& other) const { + assert(reserved_ == 0); + assert(other.reserved_ == 0); + return control_ >= other.control_; + } + inline bool operator==(const Address& other) const { + return control_ == other.control_; + } + inline bool operator!=(const Address& other) const { + return control_ != other.control_; + } + + /// Accessors. + inline uint32_t page() const { + return static_cast(page_); + } + inline uint32_t offset() const { + return static_cast(offset_); + } + inline uint64_t control() const { + return control_; + } + + private: + union { + struct { + uint64_t offset_ : kOffsetBits; // 25 bits + uint64_t page_ : kPageBits; // 23 bits + uint64_t reserved_ : 64 - kAddressBits; // 16 bits + }; + uint64_t control_; + }; +}; +static_assert(sizeof(Address) == 8, "sizeof(Address) != 8"); + +} +} // namespace FASTER::core + +/// Implement std::min() for Address type. +namespace std { +template <> +inline const FASTER::core::Address& min(const FASTER::core::Address& a, + const FASTER::core::Address& b) { + return (b < a) ? b : a; +} +} + +namespace FASTER { +namespace core { + +/// Atomic (logical) address. +class AtomicAddress { + public: + AtomicAddress(const Address& address) + : control_{ address.control() } { + } + + /// Atomic access. + inline Address load() const { + return Address{ control_.load() }; + } + inline void store(Address value) { + control_.store(value.control()); + } + inline bool compare_exchange_strong(Address& expected, Address desired) { + uint64_t expected_control = expected.control(); + bool result = control_.compare_exchange_strong(expected_control, desired.control()); + expected = Address{ expected_control }; + return result; + } + + /// Accessors. + inline uint32_t page() const { + return load().page(); + } + inline uint32_t offset() const { + return load().offset(); + } + inline uint64_t control() const { + return load().control(); + } + + private: + /// Atomic access to the address. + std::atomic control_; +}; + +} +} // namespace FASTER::core diff --git a/cc/src/core/alloc.h b/cc/src/core/alloc.h new file mode 100644 index 000000000..fefe5e806 --- /dev/null +++ b/cc/src/core/alloc.h @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
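Address above packs a 25-bit in-page offset and a 23-bit page index into the low 48 bits of one 64-bit control word, so pages are 2^25 = 32 MB and the top 16 bits remain free for the hash table's tag and control bits. A small sanity-check sketch of that packing (the include path is assumed):

#include <cassert>
#include <cstdint>
#include "core/address.h"

using namespace FASTER::core;

int main() {
  Address a{ /* page */ 3, /* offset */ 100 };
  assert(a.page() == 3 && a.offset() == 100);
  // The control word is (page << kOffsetBits) | offset.
  assert(a.control() == ((uint64_t)3 << Address::kOffsetBits) + 100);

  Address b = a;
  b += Address::kMaxOffset + 1;  // advancing by a full page bumps the page index
  assert(b.page() == 4 && b.offset() == 100);
  return 0;
}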
+ +#pragma once + +#include + +#ifdef _WIN32 +#include +#endif + +namespace FASTER { +namespace core { + +/// Windows and standard C++/Linux have incompatible implementations of aligned malloc(). (Windows +/// defines a corresponding aligned free(), while Linux relies on the ordinary free().) +inline void* aligned_alloc(size_t alignment, size_t size) { +#ifdef _WIN32 + return _aligned_malloc(size, alignment); +#else + return ::aligned_alloc(alignment, size); +#endif +} + +inline void aligned_free(void* ptr) { +#ifdef _WIN32 + _aligned_free(ptr); +#else + ::free(ptr); +#endif +} + +} +} // namespace FASTER::core + diff --git a/cc/src/core/async.h b/cc/src/core/async.h new file mode 100644 index 000000000..ce1786d82 --- /dev/null +++ b/cc/src/core/async.h @@ -0,0 +1,132 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include + +#include "auto_ptr.h" +#include "status.h" + +namespace FASTER { +namespace core { + +#define RETURN_NOT_OK(s) do { \ + Status _s = (s); \ + if (_s != Status::Ok) return _s; \ + } while (0) + +class IAsyncContext; + +/// Signature of the async callback for I/Os. +typedef void(*AsyncIOCallback)(IAsyncContext* context, Status result, size_t bytes_transferred); + +/// Standard interface for contexts used by async callbacks. +class IAsyncContext { + public: + IAsyncContext() + : from_deep_copy_{ false } { + } + + virtual ~IAsyncContext() { } + + /// Contexts are initially allocated (as local variables) on the stack. When an operation goes + /// async, it deep copies its context to a new heap allocation; this context must also deep copy + /// its parent context, if any. Once a context has been deep copied, subsequent DeepCopy() calls + /// just return the original, heap-allocated copy. + Status DeepCopy(IAsyncContext*& context_copy) { + if(from_deep_copy_) { + // Already on the heap: nothing to do. + context_copy = this; + return Status::Ok; + } else { + RETURN_NOT_OK(DeepCopy_Internal(context_copy)); + context_copy->from_deep_copy_ = true; + return Status::Ok; + } + } + + /// Whether the internal state for the async context has been copied to a heap-allocated memory + /// block. + bool from_deep_copy() const { + return from_deep_copy_; + } + + protected: + /// Override this method to make a deep, persistent copy of your context. A context should: + /// 1. Allocate memory for its copy. If the allocation fails, return Status::OutOfMemory. + /// 2. If it has a parent/caller context, call DeepCopy() on that context. If the call fails, + /// free the memory it just allocated and return the call's error code. + /// 3. Initialize its copy and return Status::Ok.. + virtual Status DeepCopy_Internal(IAsyncContext*& context_copy) = 0; + + /// A common pattern: deep copy, when context has no parent/caller context. + template + inline static Status DeepCopy_Internal(C& context, IAsyncContext*& context_copy) { + context_copy = nullptr; + auto ctxt = alloc_context(sizeof(C)); + if(!ctxt.get()) return Status::OutOfMemory; + new(ctxt.get()) C{ context }; + context_copy = ctxt.release(); + return Status::Ok; + } + /// Another common pattern: deep copy, when context has a parent/caller context. 
+ template + inline static Status DeepCopy_Internal(C& context, IAsyncContext* caller_context, + IAsyncContext*& context_copy) { + context_copy = nullptr; + auto ctxt = alloc_context(sizeof(C)); + if(!ctxt.get()) return Status::OutOfMemory; + IAsyncContext* caller_context_copy; + RETURN_NOT_OK(caller_context->DeepCopy(caller_context_copy)); + new(ctxt.get()) C{ context, caller_context_copy }; + context_copy = ctxt.release(); + return Status::Ok; + } + + private: + /// Whether the internal state for the async context has been copied to a heap-allocated memory + /// block. + bool from_deep_copy_; +}; + +/// User-defined callbacks for async FASTER operations. Async callback equivalent of: +/// Status some_function(context* arg). +typedef void(*AsyncCallback)(IAsyncContext* ctxt, Status result); + +/// Helper class, for use inside a continuation callback, that ensures the context will be freed +/// when the callback exits. +template +class CallbackContext { + public: + CallbackContext(IAsyncContext* context) + : async{ false } { + context_ = make_context_unique_ptr(static_cast(context)); + } + + ~CallbackContext() { + if(async || !context_->from_deep_copy()) { + // The callback went async again, or it never went async. The next callback or the caller is + // responsible for freeing the context. + context_.release(); + } + } + + C* get() const { + return context_.get(); + } + C* operator->() const { + return context_.get(); + } + + public: + bool async; + protected: + context_unique_ptr_t context_; +}; + +} +} // namespace FASTER::core \ No newline at end of file diff --git a/cc/src/core/async_result_types.h b/cc/src/core/async_result_types.h new file mode 100644 index 000000000..698bdedac --- /dev/null +++ b/cc/src/core/async_result_types.h @@ -0,0 +1,60 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include "address.h" +#include "async.h" +#include "native_buffer_pool.h" + +#ifdef _WIN32 +#include + +template +using concurrent_queue = concurrency::concurrent_queue; +#endif + +namespace FASTER { +namespace core { + +class AsyncIOContext : public IAsyncContext { + public: + AsyncIOContext(void* faster_, Address address_, + IAsyncContext* caller_context_, + concurrent_queue* thread_io_responses_, + uint64_t io_id_) + : faster{ faster_ } + , address{ address_ } + , caller_context{ caller_context_ } + , thread_io_responses{ thread_io_responses_ } + , io_id{ io_id_ } { + } + /// No copy constructor. + AsyncIOContext(const AsyncIOContext& other) = delete; + /// The deep-copy constructor. + AsyncIOContext(AsyncIOContext& other, IAsyncContext* caller_context_) + : faster{ other.faster } + , address{ other.address } + , caller_context{ caller_context_ } + , thread_io_responses{ other.thread_io_responses } + , record{ std::move(other.record) } + , io_id{ other.io_id } { + } + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, caller_context, context_copy); + } + public: + void* faster; + Address address; + IAsyncContext* caller_context; + concurrent_queue* thread_io_responses; + uint64_t io_id; + + SectorAlignedMemory record; +}; + +} +} // namespace FASTER::core \ No newline at end of file diff --git a/cc/src/core/auto_ptr.h b/cc/src/core/auto_ptr.h new file mode 100644 index 000000000..31d9edaaf --- /dev/null +++ b/cc/src/core/auto_ptr.h @@ -0,0 +1,123 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include +#include + +#include "alloc.h" +#include "lss_allocator.h" + +#ifdef _WIN32 +#include +#pragma intrinsic(_BitScanReverse64) +#else +namespace FASTER { +/// Convert GCC's __builtin_clzl() to Microsoft's _BitScanReverse64(). +inline uint8_t _BitScanReverse64(unsigned long* index, uint64_t mask) { + bool found = mask > 0; + *index = 63 - __builtin_clzl(mask); + return found; +} +} +#endif + +/// Wrappers for C++ std::unique_ptr<>. + +namespace FASTER { +namespace core { + +/// Round the specified size up to the next power of 2. +inline size_t next_power_of_two(size_t size) { + assert(size > 0); + // BSR returns the index k of the most-significant 1 bit. So 2^(k+1) > (size - 1) >= 2^k, + // which means 2^(k+1) >= size > 2^k. + unsigned long k; + uint8_t found = _BitScanReverse64(&k, size - 1); + return (uint64_t)1 << (found * (k + 1)); +} + +/// Pad alignment to specified. Declared "constexpr" so that the calculation can be performed at +/// compile time, assuming parameters "size" and "alignment" are known then. +constexpr inline size_t pad_alignment(size_t size, size_t alignment) { + assert(alignment > 0); + // Function implemented only for powers of 2. + assert((alignment & (alignment - 1)) == 0); + size_t max_padding = alignment - 1; + return (size + max_padding) & ~max_padding; +} + +/// Pad alignment to specified type. +template +constexpr inline size_t pad_alignment(size_t size) { + return pad_alignment(size, alignof(T)); +} + +/// Defined in C++ 14; copying the definition here for older compilers. +template +using remove_const_t = typename std::remove_const::type; + +/// alloc_aligned(): allocate a unique_ptr with a particular alignment. +template +void unique_ptr_aligned_deleter(T* p) { + auto q = const_cast*>(p); + q->~T(); + aligned_free(q); +} + +template +struct AlignedDeleter { + void operator()(T* p) const { + unique_ptr_aligned_deleter(p); + } +}; + +template +using aligned_unique_ptr_t = std::unique_ptr>; +static_assert(sizeof(aligned_unique_ptr_t) == 8, "sizeof(unique_aligned_ptr_t)"); + +template +aligned_unique_ptr_t make_aligned_unique_ptr(T* p) { + return aligned_unique_ptr_t(p, AlignedDeleter()); +} + +template +aligned_unique_ptr_t alloc_aligned(size_t alignment, size_t size) { + return make_aligned_unique_ptr(reinterpret_cast(aligned_alloc(alignment, size))); +} + +/// alloc_context(): allocate a small chunk of memory for a callback context. +template +void unique_ptr_context_deleter(T* p) { + auto q = const_cast*>(p); + q->~T(); + lss_allocator.Free(q); +} + +template +struct ContextDeleter { + void operator()(T* p) const { + unique_ptr_context_deleter(p); + } +}; + +template +using context_unique_ptr_t = std::unique_ptr>; +static_assert(sizeof(context_unique_ptr_t) == 8, "sizeof(context_unique_ptr_t)"); + +template +context_unique_ptr_t make_context_unique_ptr(T* p) { + return context_unique_ptr_t(p, ContextDeleter()); +} + +template +context_unique_ptr_t alloc_context(uint32_t size) { + return make_context_unique_ptr(reinterpret_cast(lss_allocator.Allocate(size))); +} + +} +} // namespace FASTER::core diff --git a/cc/src/core/checkpoint_locks.h b/cc/src/core/checkpoint_locks.h new file mode 100644 index 000000000..c1f31cd36 --- /dev/null +++ b/cc/src/core/checkpoint_locks.h @@ -0,0 +1,192 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
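The helpers above recur throughout the code base: next_power_of_two() rounds a requested table size up to a power of two, pad_alignment() rounds a size up to an alignment boundary, and alloc_aligned<T>() / alloc_context<T>() return unique_ptrs whose deleters invoke the matching aligned or log-structured free. A quick illustrative sketch (include path assumed):

#include <cassert>
#include <cstdint>
#include "core/auto_ptr.h"

using namespace FASTER::core;

int main() {
  // next_power_of_two(): smallest 2^k with 2^k >= size.
  assert(next_power_of_two(1000) == 1024);
  assert(next_power_of_two(1024) == 1024);

  // pad_alignment(): round 10 bytes up to an 8-byte boundary.
  assert(pad_alignment(10, 8) == 16);
  assert(pad_alignment<uint64_t>(9) == 16);

  // Cache-line-aligned buffer; the deleter calls the matching aligned free.
  auto buffer = alloc_aligned<uint64_t>(64, 64 * sizeof(uint64_t));
  assert(reinterpret_cast<uintptr_t>(buffer.get()) % 64 == 0);
  return 0;
}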
+ +#pragma once + +#include +#include +#include +#include + +#include "alloc.h" +#include "constants.h" +#include "key_hash.h" + +namespace FASTER { +namespace core { + +struct CheckpointLock { + CheckpointLock() + : control_{ 0 } { + } + CheckpointLock(uint64_t control) + : control_{ control } { + } + CheckpointLock(uint32_t old_lock_count, uint32_t new_lock_count) + : old_lock_count_{ old_lock_count } + , new_lock_count_{ new_lock_count } { + } + + union { + struct { + uint32_t old_lock_count_; + uint32_t new_lock_count_; + }; + uint64_t control_; + }; +}; +static_assert(sizeof(CheckpointLock) == 8, "sizeof(CheckpointLock) != 8"); + +class AtomicCheckpointLock { + public: + AtomicCheckpointLock() + : control_{ 0 } { + } + + /// Try to lock the old version of a record. + inline bool try_lock_old() { + CheckpointLock expected{ control_.load() }; + while(expected.new_lock_count_ == 0) { + CheckpointLock desired{ expected.old_lock_count_ + 1, 0 }; + if(control_.compare_exchange_strong(expected.control_, desired.control_)) { + return true; + } + } + return false; + } + inline void unlock_old() { + control_ -= CheckpointLock{ 1, 0 } .control_; + } + + /// Try to lock the new version of a record. + inline bool try_lock_new() { + CheckpointLock expected{ control_.load() }; + while(expected.old_lock_count_ == 0) { + CheckpointLock desired{ 0, expected.new_lock_count_ + 1 }; + if(control_.compare_exchange_strong(expected.control_, desired.control_)) { + return true; + } + } + return false; + } + inline void unlock_new() { + control_ -= CheckpointLock{ 0, 1 } .control_; + } + + inline bool old_locked() const { + CheckpointLock result{ control_ }; + return result.old_lock_count_ > 0; + } + inline bool new_locked() const { + CheckpointLock result{ control_ }; + return result.new_lock_count_ > 0; + } + + private: + union { + std::atomic control_; + }; +}; +static_assert(sizeof(AtomicCheckpointLock) == 8, "sizeof(AtomicCheckpointLock) != 8"); + +class CheckpointLocks { + public: + CheckpointLocks() + : size_{ 0 } + , locks_{ nullptr } { + } + + ~CheckpointLocks() { + if(locks_) { + aligned_free(locks_); + } + } + + void Initialize(uint64_t size) { + assert(size < INT32_MAX); + assert(Utility::IsPowerOfTwo(size)); + if(locks_) { + aligned_free(locks_); + } + size_ = size; + locks_ = reinterpret_cast(aligned_alloc(Constants::kCacheLineBytes, + size_ * sizeof(AtomicCheckpointLock))); + std::memset(locks_, 0, size_ * sizeof(AtomicCheckpointLock)); + } + + void Free() { + assert(locks_); +#ifdef _DEBUG + for(uint64_t idx = 0; idx < size_; ++idx) { + assert(!locks_[idx].old_locked()); + assert(!locks_[idx].new_locked()); + } +#endif + aligned_free(locks_); + size_ = 0; + locks_ = nullptr; + } + + inline uint64_t size() const { + return size_; + } + + inline AtomicCheckpointLock& get_lock(KeyHash hash) { + return locks_[hash.idx(size_)]; + } + + private: + uint64_t size_; + AtomicCheckpointLock* locks_; +}; + +class CheckpointLockGuard { + public: + CheckpointLockGuard(CheckpointLocks& locks, KeyHash hash) + : lock_{ nullptr } + , locked_old_{ false } + , locked_new_{ false } { + if(locks.size() > 0) { + lock_ = &locks.get_lock(hash); + } + } + ~CheckpointLockGuard() { + if(lock_) { + if(locked_old_) { + lock_->unlock_old(); + } + if(locked_new_) { + lock_->unlock_new(); + } + } + } + inline bool try_lock_old() { + assert(lock_); + assert(!locked_old_); + locked_old_ = lock_->try_lock_old(); + return locked_old_; + } + inline bool try_lock_new() { + assert(lock_); + assert(!locked_new_); + locked_new_ = 
lock_->try_lock_new(); + return locked_new_; + } + + inline bool old_locked() const { + assert(lock_); + return lock_->old_locked(); + } + inline bool new_locked() const { + assert(lock_); + return lock_->new_locked(); + } + + private: + AtomicCheckpointLock* lock_; + bool locked_old_; + bool locked_new_; +}; + +} +} // namespace FASTER::core diff --git a/cc/src/core/checkpoint_state.h b/cc/src/core/checkpoint_state.h new file mode 100644 index 000000000..44b4b2ec4 --- /dev/null +++ b/cc/src/core/checkpoint_state.h @@ -0,0 +1,166 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include "address.h" +#include "guid.h" +#include "malloc_fixed_page_size.h" +#include "thread.h" + +namespace FASTER { +namespace core { + +/// Checkpoint metadata for the index itself. +class IndexMetadata { + public: + IndexMetadata() + : version{ 0 } + , table_size{ 0 } + , num_ht_bytes{ 0 } + , num_ofb_bytes{ 0 } + , ofb_count{ FixedPageAddress::kInvalidAddress } + , log_begin_address{ Address::kInvalidAddress } + , checkpoint_start_address{ Address::kInvalidAddress } { + } + + inline void Initialize(uint32_t version_, uint64_t size_, Address log_begin_address_, + Address checkpoint_start_address_) { + version = version_; + table_size = size_; + log_begin_address = log_begin_address_; + checkpoint_start_address = checkpoint_start_address_; + num_ht_bytes = 0; + num_ofb_bytes = 0; + ofb_count = FixedPageAddress::kInvalidAddress; + } + inline void Reset() { + version = 0; + table_size = 0; + num_ht_bytes = 0; + num_ofb_bytes = 0; + ofb_count = FixedPageAddress::kInvalidAddress; + log_begin_address = Address::kInvalidAddress; + checkpoint_start_address = Address::kInvalidAddress; + } + + uint32_t version; + uint64_t table_size; + uint64_t num_ht_bytes; + uint64_t num_ofb_bytes; + FixedPageAddress ofb_count; + /// Earliest address that is valid for the log. + Address log_begin_address; + /// Address as of which this checkpoint was taken. + Address checkpoint_start_address; +}; +static_assert(sizeof(IndexMetadata) == 56, "sizeof(IndexMetadata) != 56"); + +/// Checkpoint metadata, for the log. +class LogMetadata { + public: + LogMetadata() + : use_snapshot_file{ false } + , version{ UINT32_MAX } + , num_threads{ 0 } + , flushed_address{ Address::kInvalidAddress } + , final_address{ Address::kMaxAddress } { + std::memset(guids, 0, sizeof(guids)); + std::memset(monotonic_serial_nums, 0, sizeof(monotonic_serial_nums)); + } + + inline void Initialize(bool use_snapshot_file_, uint32_t version_, Address flushed_address_) { + use_snapshot_file = use_snapshot_file_; + version = version_; + num_threads = 0; + flushed_address = flushed_address_; + final_address = Address::kMaxAddress; + std::memset(guids, 0, sizeof(guids)); + std::memset(monotonic_serial_nums, 0, sizeof(monotonic_serial_nums)); + } + inline void Reset() { + Initialize(false, UINT32_MAX, Address::kInvalidAddress); + } + + bool use_snapshot_file; + uint32_t version; + std::atomic num_threads; + Address flushed_address; + Address final_address; + uint64_t monotonic_serial_nums[Thread::kMaxNumThreads]; + Guid guids[Thread::kMaxNumThreads]; +}; +static_assert(sizeof(LogMetadata) == 32 + (24 * Thread::kMaxNumThreads), + "sizeof(LogMetadata) != 32 + (24 * Thread::kMaxNumThreads)"); + +/// State of the active Checkpoint()/Recover() call, including metadata written to disk. 
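AtomicCheckpointLock above admits any number of holders of the old version or any number of holders of the new version, but never both at once; because the two 32-bit counts share a single atomic word, one compare-and-swap decides which side wins. A small illustrative sketch (include path assumed):

#include <cassert>
#include "core/checkpoint_locks.h"

using namespace FASTER::core;

int main() {
  AtomicCheckpointLock lock;
  bool old1 = lock.try_lock_old();  // succeeds: no new-version holders
  bool old2 = lock.try_lock_old();  // succeeds: old-version locks are shared
  bool new1 = lock.try_lock_new();  // fails: old-version locks are still held
  assert(old1 && old2 && !new1);
  lock.unlock_old();
  lock.unlock_old();
  bool new2 = lock.try_lock_new();  // succeeds once the old side has drained
  assert(new2);
  lock.unlock_new();
  return 0;
}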
+template +class CheckpointState { + public: + typedef F file_t; + typedef void(*persistence_callback_t)(uint64_t persistent_serial_num); + + CheckpointState() + : index_checkpoint_started{ false } + , failed{ false } + , flush_pending{ UINT32_MAX } + , persistence_callback{ nullptr } { + } + + void InitializeCheckpoint(uint32_t version, uint64_t table_size, Address log_begin_address, + Address checkpoint_start_address, bool use_snapshot_file, + Address flushed_until_address, + persistence_callback_t persistence_callback_) { + failed = false; + index_checkpoint_started = false; + continue_tokens.clear(); + index_metadata.Initialize(version, table_size, log_begin_address, checkpoint_start_address); + log_metadata.Initialize(use_snapshot_file, version, flushed_until_address); + if(use_snapshot_file) { + flush_pending = UINT32_MAX; + } else { + flush_pending = 0; + } + persistence_callback = persistence_callback_; + } + + void CheckpointDone() { + assert(!failed); + assert(index_checkpoint_started); + assert(continue_tokens.empty()); + assert(flush_pending == 0); + index_metadata.Reset(); + log_metadata.Reset(); + snapshot_file.Close(); + persistence_callback = nullptr; + } + + inline void InitializeRecover() { + failed = false; + } + + void RecoverDone() { + assert(!failed); + index_metadata.Reset(); + log_metadata.Reset(); + snapshot_file.Close(); + } + + std::atomic index_checkpoint_started; + std::atomic failed; + IndexMetadata index_metadata; + LogMetadata log_metadata; + /// State used when fold_over_snapshot = false. + file_t snapshot_file; + std::atomic flush_pending; + + persistence_callback_t persistence_callback; + std::unordered_map continue_tokens; +}; + +} +} // namespace FASTER::core + diff --git a/cc/src/core/constants.h b/cc/src/core/constants.h new file mode 100644 index 000000000..a1746f11b --- /dev/null +++ b/cc/src/core/constants.h @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include + +namespace FASTER { +namespace core { + +struct Constants { + /// Size of cache line in bytes + static constexpr uint32_t kCacheLineBytes = 64; + + /// We issue 256 writes to disk, to checkpoint the hash table. + static constexpr uint32_t kNumMergeChunks = 256; +}; + +} +} // namespace FASTER::cire \ No newline at end of file diff --git a/cc/src/core/faster.h b/cc/src/core/faster.h new file mode 100644 index 000000000..887dab6d4 --- /dev/null +++ b/cc/src/core/faster.h @@ -0,0 +1,2558 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "device/file_system_disk.h" + +#include "alloc.h" +#include "checkpoint_locks.h" +#include "checkpoint_state.h" +#include "constants.h" +#include "gc_state.h" +#include "grow_state.h" +#include "guid.h" +#include "hash_table.h" +#include "internal_contexts.h" +#include "key_hash.h" +#include "malloc_fixed_page_size.h" +#include "persistent_memory_malloc.h" +#include "record.h" +#include "recovery_status.h" +#include "state_transitions.h" +#include "status.h" +#include "utility.h" + +using namespace std::chrono_literals; + +/// The FASTER key-value store, and related classes. 
+ +namespace FASTER { +namespace core { + +class alignas(Constants::kCacheLineBytes) ThreadContext { + public: + ThreadContext() + : contexts_{} + , cur_{ 0 } { + } + + inline const ExecutionContext& cur() const { + return contexts_[cur_]; + } + inline ExecutionContext& cur() { + return contexts_[cur_]; + } + + inline const ExecutionContext& prev() const { + return contexts_[(cur_ + 1) % 2]; + } + inline ExecutionContext& prev() { + return contexts_[(cur_ + 1) % 2]; + } + + inline void swap() { + cur_ = (cur_ + 1) % 2; + } + + private: + ExecutionContext contexts_[2]; + uint8_t cur_; +}; +static_assert(sizeof(ThreadContext) == 448, "sizeof(ThreadContext) != 448"); + +/// The FASTER key-value store. +template +class FasterKv { + public: + typedef FasterKv faster_t; + + /// Keys and values stored in this key-value store. + typedef K key_t; + typedef V value_t; + + typedef D disk_t; + typedef typename D::file_t file_t; + typedef typename D::log_file_t log_file_t; + + typedef PersistentMemoryMalloc hlog_t; + + /// Contexts that have been deep-copied, for async continuations, and must be accessed via + /// virtual function calls. + typedef AsyncPendingReadContext async_pending_read_context_t; + typedef AsyncPendingUpsertContext async_pending_upsert_context_t; + typedef AsyncPendingRmwContext async_pending_rmw_context_t; + + FasterKv(uint64_t table_size, uint64_t log_size, const std::string& filename, + double log_mutable_fraction = 0.9) + : min_table_size_{ table_size } + , disk{ filename, epoch_ } + , hlog{ log_size, epoch_, disk, disk.log(), log_mutable_fraction } + , system_state_{ Action::None, Phase::REST, 1 } + , num_pending_ios{ 0 } { + if(!Utility::IsPowerOfTwo(table_size)) { + throw std::invalid_argument{ " Size is not a power of 2" }; + } + if(table_size > INT32_MAX) { + throw std::invalid_argument{ " Cannot allocate such a large hash table " }; + } + + resize_info_.version = 0; + state_[0].Initialize(table_size, disk.log().alignment()); + overflow_buckets_allocator_[0].Initialize(disk.log().alignment(), epoch_); + } + + // No copy constructor. + FasterKv(const FasterKv& other) = delete; + + public: + /// Thread-related operations + Guid StartSession(); + uint64_t ContinueSession(const Guid& guid); + void StopSession(); + void Refresh(); + + /// Store interface + template + inline Status Read(RC& context, AsyncCallback callback, uint64_t monotonic_serial_num); + + template + inline Status Upsert(UC& context, AsyncCallback callback, uint64_t monotonic_serial_num); + + template + inline Status Rmw(MC& context, AsyncCallback callback, uint64_t monotonic_serial_num); + /// Delete() not yet implemented! + // void Delete(const Key& key, Context& context, uint64_t lsn); + inline bool CompletePending(bool wait = false); + + /// Checkpoint/recovery operations. + bool Checkpoint(void(*persistence_callback)(uint64_t persistent_serial_num)); + Status Recover(uint32_t cpr_version, uint32_t index_version, std::vector& session_ids); + + /// Truncating the head of the log. + bool ShiftBeginAddress(Address address, GcState::truncate_callback_t truncate_callback, + GcState::complete_callback_t complete_callback); + + /// Make the hash table larger. 
+ bool GrowIndex(GrowState::callback_t caller_callback); + + /// Statistics + inline uint64_t Size() const { + return hlog.GetTailAddress().control(); + } + inline void DumpDistribution() { + state_[resize_info_.version].DumpDistribution( + overflow_buckets_allocator_[resize_info_.version]); + } + + private: + typedef Record record_t; + + typedef PendingContext pending_context_t; + + template + inline OperationStatus InternalRead(C& pending_context) const; + + template + inline OperationStatus InternalUpsert(C& pending_context); + + template + inline OperationStatus InternalRmw(C& pending_context, bool retrying); + + inline OperationStatus InternalRetryPendingRmw(async_pending_rmw_context_t& pending_context); + + OperationStatus InternalContinuePendingRead(ExecutionContext& ctx, + AsyncIOContext& io_context); + OperationStatus InternalContinuePendingRmw(ExecutionContext& ctx, + AsyncIOContext& io_context); + + // Find the hash bucket entry, if any, corresponding to the specified hash. + inline const AtomicHashBucketEntry* FindEntry(KeyHash hash) const; + // If a hash bucket entry corresponding to the specified hash exists, return it; otherwise, + // create a new entry. The caller can use the "expected_entry" to CAS its desired address into + // the entry. + inline AtomicHashBucketEntry* FindOrCreateEntry(KeyHash hash, HashBucketEntry& expected_entry, + HashBucket*& bucket); + inline Address TraceBackForKeyMatch(const key_t& key, Address from_address, + Address min_offset) const; + Address TraceBackForOtherChainStart(uint64_t old_size, uint64_t new_size, Address from_address, + Address min_address, uint8_t side); + + // If a hash bucket entry corresponding to the specified hash exists, return it; otherwise, + // return an unused bucket entry. + inline AtomicHashBucketEntry* FindTentativeEntry(KeyHash hash, HashBucket* bucket, + uint8_t version, HashBucketEntry& expected_entry); + // Looks for an entry that has the same + inline bool HasConflictingEntry(KeyHash hash, const HashBucket* bucket, uint8_t version, + const AtomicHashBucketEntry* atomic_entry) const; + + inline Address BlockAllocate(uint32_t record_size); + + inline Status HandleOperationStatus(ExecutionContext& ctx, + pending_context_t& pending_context, + OperationStatus internal_status, bool& async); + inline Status PivotAndRetry(ExecutionContext& ctx, pending_context_t& pending_context, + bool& async); + inline Status RetryLater(ExecutionContext& ctx, pending_context_t& pending_context, + bool& async); + inline constexpr uint32_t MinIoRequestSize() const; + inline Status IssueAsyncIoRequest(ExecutionContext& ctx, pending_context_t& pending_context, + bool& async); + + void AsyncGetFromDisk(Address address, uint32_t num_records, AsyncIOCallback callback, + AsyncIOContext& context); + static void AsyncGetFromDiskCallback(IAsyncContext* ctxt, Status result, + size_t bytes_transferred); + + void CompleteIoPendingRequests(ExecutionContext& context); + void CompleteRetryRequests(ExecutionContext& context); + + void InitializeCheckpointLocks(); + + /// Checkpoint/recovery methods. 
+ void HandleSpecialPhases(); + bool GlobalMoveToNextState(SystemState current_state); + + Status CheckpointFuzzyIndex(); + Status CheckpointFuzzyIndexComplete(); + Status RecoverFuzzyIndex(); + Status RecoverFuzzyIndexComplete(bool wait); + + Status WriteIndexMetadata(); + Status ReadIndexMetadata(uint32_t version); + Status WriteCprMetadata(); + Status ReadCprMetadata(uint32_t version); + Status WriteCprContext(); + Status ReadCprContexts(uint32_t version, const Guid* guids); + + Status RecoverHybridLog(); + Status RecoverHybridLogFromSnapshotFile(); + Status RecoverFromPage(Address from_address, Address to_address); + Status RestoreHybridLog(); + + void MarkAllPendingRequests(); + + inline void HeavyEnter(); + bool CleanHashTableBuckets(); + void SplitHashTableBuckets(); + void AddHashEntry(HashBucket*& bucket, uint32_t& next_idx, uint8_t version, + HashBucketEntry entry); + + /// Access the current and previous (thread-local) execution contexts. + const ExecutionContext& thread_ctx() const { + return thread_contexts_[Thread::id()].cur(); + } + ExecutionContext& thread_ctx() { + return thread_contexts_[Thread::id()].cur(); + } + ExecutionContext& prev_thread_ctx() { + return thread_contexts_[Thread::id()].prev(); + } + + private: + LightEpoch epoch_; + + public: + disk_t disk; + hlog_t hlog; + + private: + static constexpr bool kCopyReadsToTail = false; + static constexpr uint64_t kGcHashTableChunkSize = 16384; + static constexpr uint64_t kGrowHashTableChunkSize = 16384; + + bool fold_over_snapshot = true; + + /// Initial size of the table + uint64_t min_table_size_; + + // Allocator for the hash buckets that don't fit in the hash table. + MallocFixedPageSize overflow_buckets_allocator_[2]; + + // An array of size two, that contains the old and new versions of the hash-table + InternalHashTable state_[2]; + + CheckpointLocks checkpoint_locks_; + + ResizeInfo resize_info_; + + AtomicSystemState system_state_; + + /// Checkpoint/recovery state. + CheckpointState checkpoint_; + /// Garbage collection state. + GcState gc_; + /// Grow (hash table) state. + GrowState grow_; + + /// Global count of pending I/Os, used for throttling. + std::atomic num_pending_ios; + + /// Space for two contexts per thread, stored inline. + ThreadContext thread_contexts_[Thread::kMaxNumThreads]; +}; + +// Implementations. +template +inline Guid FasterKv::StartSession() { + SystemState state = system_state_.load(); + if(state.phase != Phase::REST) { + throw std::runtime_error{ "Can acquire only in REST phase!" }; + } + thread_ctx().Initialize(state.phase, state.version, Guid::Create(), 0); + Refresh(); + return thread_ctx().guid; +} + +template +inline uint64_t FasterKv::ContinueSession(const Guid& session_id) { + auto iter = checkpoint_.continue_tokens.find(session_id); + if(iter == checkpoint_.continue_tokens.end()) { + throw std::invalid_argument{ "Unknown session ID" }; + } + + SystemState state = system_state_.load(); + if(state.phase != Phase::REST) { + throw std::runtime_error{ "Can continue only in REST phase!" 
}; + } + thread_ctx().Initialize(state.phase, state.version, session_id, iter->second); + Refresh(); + return iter->second; +} + +template +inline void FasterKv::Refresh() { + epoch_.ProtectAndDrain(); + // We check if we are in normal mode + SystemState new_state = system_state_.load(); + if(thread_ctx().phase == Phase::REST && new_state.phase == Phase::REST) { + return; + } + HandleSpecialPhases(); +} + +template +inline void FasterKv::StopSession() { + // If this thread is still involved in some activity, wait until it finishes. + while(thread_ctx().phase != Phase::REST || + !thread_ctx().pending_ios.empty() || + !thread_ctx().retry_requests.empty()) { + CompletePending(false); + std::this_thread::yield(); + } + + assert(thread_ctx().retry_requests.empty()); + assert(thread_ctx().pending_ios.empty()); + assert(thread_ctx().io_responses.empty()); + + assert(prev_thread_ctx().retry_requests.empty()); + assert(prev_thread_ctx().pending_ios.empty()); + assert(prev_thread_ctx().io_responses.empty()); + + assert(thread_ctx().phase == Phase::REST); + + epoch_.Unprotect(); +} + +template +inline const AtomicHashBucketEntry* FasterKv::FindEntry(KeyHash hash) const { + // Truncate the hash to get a bucket page_index < state[version].size. + uint32_t version = resize_info_.version; + const HashBucket* bucket = &state_[version].bucket(hash); + assert(reinterpret_cast(bucket) % Constants::kCacheLineBytes == 0); + + while(true) { + // Search through the bucket looking for our key. Last entry is reserved + // for the overflow pointer. + for(uint32_t entry_idx = 0; entry_idx < HashBucket::kNumEntries; ++entry_idx) { + HashBucketEntry entry = bucket->entries[entry_idx].load(); + if(entry.unused()) { + continue; + } + if(hash.tag() == entry.tag()) { + // Found a matching tag. (So, the input hash matches the entry on 14 tag bits + + // log_2(table size) address bits.) + if(!entry.tentative()) { + // If (final key, return immediately) + return &bucket->entries[entry_idx]; + } + } + } + + // Go to next bucket in the chain + HashBucketOverflowEntry entry = bucket->overflow_entry.load(); + if(entry.unused()) { + // No more buckets in the chain. + return nullptr; + } + bucket = &overflow_buckets_allocator_[version].Get(entry.address()); + assert(reinterpret_cast(bucket) % Constants::kCacheLineBytes == 0); + } + assert(false); + return nullptr; // NOT REACHED +} + +template +inline AtomicHashBucketEntry* FasterKv::FindTentativeEntry(KeyHash hash, + HashBucket* bucket, + uint8_t version, HashBucketEntry& expected_entry) { + expected_entry = HashBucketEntry::kInvalidEntry; + AtomicHashBucketEntry* atomic_entry = nullptr; + // Try to find a slot that contains the right tag or that's free. + while(true) { + // Search through the bucket looking for our key. Last entry is reserved + // for the overflow pointer. + for(uint32_t entry_idx = 0; entry_idx < HashBucket::kNumEntries; ++entry_idx) { + HashBucketEntry entry = bucket->entries[entry_idx].load(); + if(entry.unused()) { + if(!atomic_entry) { + // Found a free slot; keep track of it, and continue looking for a match. + atomic_entry = &bucket->entries[entry_idx]; + } + continue; + } + if(hash.tag() == entry.tag() && !entry.tentative()) { + // Found a match. (So, the input hash matches the entry on 14 tag bits + + // log_2(table size) address bits.) Return it to caller. 
+ expected_entry = entry; + return &bucket->entries[entry_idx]; + } + } + // Go to next bucket in the chain + HashBucketOverflowEntry overflow_entry = bucket->overflow_entry.load(); + if(overflow_entry.unused()) { + // No more buckets in the chain. + if(atomic_entry) { + // We found a free slot earlier (possibly inside an earlier bucket). + assert(expected_entry == HashBucketEntry::kInvalidEntry); + return atomic_entry; + } + // We didn't find any free slots, so allocate new bucket. + FixedPageAddress new_bucket_addr = overflow_buckets_allocator_[version].Allocate(); + bool success; + do { + HashBucketOverflowEntry new_bucket_entry{ new_bucket_addr }; + success = bucket->overflow_entry.compare_exchange_strong(overflow_entry, + new_bucket_entry); + } while(!success && overflow_entry.unused()); + if(!success) { + // Install failed, undo allocation; use the winner's entry + overflow_buckets_allocator_[version].FreeAtEpoch(new_bucket_addr, 0); + } else { + // Install succeeded; we have a new bucket on the chain. Return its first slot. + bucket = &overflow_buckets_allocator_[version].Get(new_bucket_addr); + assert(expected_entry == HashBucketEntry::kInvalidEntry); + return &bucket->entries[0]; + } + } + // Go to the next bucket. + bucket = &overflow_buckets_allocator_[version].Get(overflow_entry.address()); + assert(reinterpret_cast(bucket) % Constants::kCacheLineBytes == 0); + } + assert(false); + return nullptr; // NOT REACHED +} + +template +bool FasterKv::HasConflictingEntry(KeyHash hash, const HashBucket* bucket, uint8_t version, + const AtomicHashBucketEntry* atomic_entry) const { + uint16_t tag = atomic_entry->load().tag(); + while(true) { + for(uint32_t entry_idx = 0; entry_idx < HashBucket::kNumEntries; ++entry_idx) { + HashBucketEntry entry = bucket->entries[entry_idx].load(); + if(entry != HashBucketEntry::kInvalidEntry && + entry.tag() == tag && + atomic_entry != &bucket->entries[entry_idx]) { + // Found a conflict. + return true; + } + } + // Go to next bucket in the chain + HashBucketOverflowEntry entry = bucket->overflow_entry.load(); + if(entry.unused()) { + // Reached the end of the bucket chain; no conflicts found. + return false; + } + // Go to the next bucket. + bucket = &overflow_buckets_allocator_[version].Get(entry.address()); + assert(reinterpret_cast(bucket) % Constants::kCacheLineBytes == 0); + } +} + +template +inline AtomicHashBucketEntry* FasterKv::FindOrCreateEntry(KeyHash hash, + HashBucketEntry& expected_entry, HashBucket*& bucket) { + bucket = nullptr; + // Truncate the hash to get a bucket page_index < state[version].size. + uint32_t version = resize_info_.version; + assert(version <= 1); + + while(true) { + bucket = &state_[version].bucket(hash); + assert(reinterpret_cast(bucket) % Constants::kCacheLineBytes == 0); + + AtomicHashBucketEntry* atomic_entry = FindTentativeEntry(hash, bucket, version, + expected_entry); + if(expected_entry != HashBucketEntry::kInvalidEntry) { + // Found an existing hash bucket entry; nothing further to check. + return atomic_entry; + } + // We have a free slot. + assert(atomic_entry); + assert(expected_entry == HashBucketEntry::kInvalidEntry); + // Try to install tentative tag in free slot. + HashBucketEntry entry{ Address::kInvalidAddress, hash.tag(), true }; + if(atomic_entry->compare_exchange_strong(expected_entry, entry)) { + // See if some other thread is also trying to install this tag. + if(HasConflictingEntry(hash, bucket, version, atomic_entry)) { + // Back off and try again. 
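FindTentativeEntry and FindOrCreateEntry above implement a two-phase insert: a thread first claims a free slot with the entry's tentative bit set, then rescans the chain for another entry carrying the same tag, and either backs off (conflict) or clears the tentative bit (no conflict). Below is a minimal single-bucket sketch of that protocol, reusing the simplified 64-bit entry encoding from the earlier sketch (again an assumption, not the real layout).

// Two-phase ("tentative") tag install, reduced to a single seven-slot bucket.
#include <atomic>
#include <cstdint>

constexpr uint64_t kTentativeBitSketch = 1ull << 63;
constexpr int kNumSlots = 7;

// Install `tag` into a free slot; returns the slot, or nullptr if the bucket is full.
std::atomic<uint64_t>* TryInstallTag(std::atomic<uint64_t> (&slots)[kNumSlots], uint64_t tag) {
  while(true) {
    std::atomic<uint64_t>* free_slot = nullptr;
    for(int i = 0; i < kNumSlots; ++i) {
      if(slots[i].load() == 0 && !free_slot) free_slot = &slots[i];
    }
    if(!free_slot) return nullptr;
    uint64_t expected = 0;
    // Phase 1: claim the slot, but mark the entry tentative.
    if(!free_slot->compare_exchange_strong(expected, tag | kTentativeBitSketch)) {
      continue;                                      // lost the race for this slot; rescan
    }
    // Phase 2: look for any other entry (tentative or not) with the same tag.
    bool conflict = false;
    for(int i = 0; i < kNumSlots; ++i) {
      uint64_t e = slots[i].load();
      if(&slots[i] != free_slot && e != 0 && (e & ~kTentativeBitSketch) == tag) conflict = true;
    }
    if(conflict) {
      free_slot->store(0);                           // back off and try again
    } else {
      free_slot->store(tag);                         // finalize: clear the tentative bit
      return free_slot;
    }
  }
}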
+ atomic_entry->store(HashBucketEntry::kInvalidEntry); + } else { + // No other thread was trying to install this tag, so we can clear our entry's "tentative" + // bit. + expected_entry = HashBucketEntry{ Address::kInvalidAddress, hash.tag(), false }; + atomic_entry->store(expected_entry); + return atomic_entry; + } + } + } + assert(false); + return nullptr; // NOT REACHED +} + +template +template +inline Status FasterKv::Read(RC& context, AsyncCallback callback, + uint64_t monotonic_serial_num) { + typedef RC read_context_t; + typedef PendingReadContext pending_read_context_t; + static_assert(std::is_base_of::value, + "value_t is not a base class of read_context_t::value_t"); + static_assert(alignof(value_t) == alignof(typename read_context_t::value_t), + "alignof(value_t) != alignof(typename read_context_t::value_t)"); + + pending_read_context_t pending_context{ context, callback }; + OperationStatus internal_status = InternalRead(pending_context); + Status status; + if(internal_status == OperationStatus::SUCCESS) { + status = Status::Ok; + } else if(internal_status == OperationStatus::NOT_FOUND) { + status = Status::NotFound; + } else { + assert(internal_status == OperationStatus::RECORD_ON_DISK); + bool async; + status = HandleOperationStatus(thread_ctx(), pending_context, internal_status, async); + } + thread_ctx().serial_num = monotonic_serial_num; + return status; +} + +template +template +inline Status FasterKv::Upsert(UC& context, AsyncCallback callback, + uint64_t monotonic_serial_num) { + typedef UC upsert_context_t; + typedef PendingUpsertContext pending_upsert_context_t; + static_assert(std::is_base_of::value, + "value_t is not a base class of upsert_context_t::value_t"); + static_assert(alignof(value_t) == alignof(typename upsert_context_t::value_t), + "alignof(value_t) != alignof(typename upsert_context_t::value_t)"); + + pending_upsert_context_t pending_context{ context, callback }; + OperationStatus internal_status = InternalUpsert(pending_context); + Status status; + + if(internal_status == OperationStatus::SUCCESS) { + status = Status::Ok; + } else { + bool async; + status = HandleOperationStatus(thread_ctx(), pending_context, internal_status, async); + } + thread_ctx().serial_num = monotonic_serial_num; + return status; +} + +template +template +inline Status FasterKv::Rmw(MC& context, AsyncCallback callback, + uint64_t monotonic_serial_num) { + typedef MC rmw_context_t; + typedef PendingRmwContext pending_rmw_context_t; + static_assert(std::is_base_of::value, + "value_t is not a base class of rmw_context_t::value_t"); + static_assert(alignof(value_t) == alignof(typename rmw_context_t::value_t), + "alignof(value_t) != alignof(typename rmw_context_t::value_t)"); + + pending_rmw_context_t pending_context{ context, callback }; + OperationStatus internal_status = InternalRmw(pending_context, false); + Status status; + if(internal_status == OperationStatus::SUCCESS) { + status = Status::Ok; + } else { + bool async; + status = HandleOperationStatus(thread_ctx(), pending_context, internal_status, async); + } + thread_ctx().serial_num = monotonic_serial_num; + return status; +} + +template +inline bool FasterKv::CompletePending(bool wait) { + do { + disk.TryComplete(); + + bool done = true; + if(thread_ctx().phase != Phase::WAIT_PENDING && thread_ctx().phase != Phase::IN_PROGRESS) { + CompleteIoPendingRequests(thread_ctx()); + } + Refresh(); + CompleteRetryRequests(thread_ctx()); + + done = (thread_ctx().pending_ios.empty() && thread_ctx().retry_requests.empty()); + + 
if(thread_ctx().phase != Phase::REST) { + CompleteIoPendingRequests(prev_thread_ctx()); + Refresh(); + CompleteRetryRequests(prev_thread_ctx()); + done = false; + } + if(done) { + return true; + } + } while(wait); + return false; +} + +template +inline void FasterKv::CompleteIoPendingRequests(ExecutionContext& context) { + AsyncIOContext* ctxt; + // Clear this thread's I/O response queue. (Does not clear I/Os issued by this thread that have + // not yet completed.) + while(context.io_responses.try_pop(ctxt)) { + CallbackContext io_context{ ctxt }; + CallbackContext pending_context{ io_context->caller_context }; + // This I/O is no longer pending, since we popped its response off the queue. + auto pending_io = context.pending_ios.find(io_context->io_id); + assert(pending_io != context.pending_ios.end()); + context.pending_ios.erase(pending_io); + + // Issue the continue command + OperationStatus internal_status; + if(pending_context->type == OperationType::Read) { + internal_status = InternalContinuePendingRead(context, *io_context.get()); + } else { + assert(pending_context->type == OperationType::RMW); + internal_status = InternalContinuePendingRmw(context, *io_context.get()); + } + Status result; + if(internal_status == OperationStatus::SUCCESS) { + result = Status::Ok; + } else if(internal_status == OperationStatus::NOT_FOUND) { + result = Status::NotFound; + } else { + result = HandleOperationStatus(context, *pending_context.get(), internal_status, + pending_context.async); + } + if(!pending_context.async) { + pending_context->caller_callback(pending_context->caller_context, result); + } + } +} + +template +inline void FasterKv::CompleteRetryRequests(ExecutionContext& context) { + // If we can't complete a request, it will be pushed back onto the deque. Retry each request + // only once. + size_t size = context.retry_requests.size(); + for(size_t idx = 0; idx < size; ++idx) { + CallbackContext pending_context{ context.retry_requests.front() }; + context.retry_requests.pop_front(); + // Issue retry command + OperationStatus internal_status; + switch(pending_context->type) { + case OperationType::RMW: + internal_status = InternalRetryPendingRmw( + *static_cast(pending_context.get())); + break; + case OperationType::Upsert: + internal_status = InternalUpsert( + *static_cast(pending_context.get())); + break; + default: + assert(false); + throw std::runtime_error{ "Cannot happen!" }; + } + // Handle operation status + Status result; + if(internal_status == OperationStatus::SUCCESS) { + result = Status::Ok; + } else { + result = HandleOperationStatus(context, *pending_context.get(), internal_status, + pending_context.async); + } + + // If done, callback user code. 
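CompleteRetryRequests above bounds work per call by snapshotting the deque's size before the loop, so a request that fails again (and is re-enqueued via HandleOperationStatus) is not retried until the next pass. A generic sketch of that pattern, with a hypothetical try_complete callable standing in for the internal retry logic:

// Retry each queued item at most once per pass by snapshotting the size up front.
#include <deque>
#include <iostream>
#include <utility>

template <typename T, typename TryFn>
void RetryOnce(std::deque<T>& retry_queue, TryFn try_complete) {
  const size_t count = retry_queue.size();          // snapshot: only pre-existing items
  for(size_t i = 0; i < count; ++i) {
    T item = std::move(retry_queue.front());
    retry_queue.pop_front();
    if(!try_complete(item)) {
      retry_queue.push_back(std::move(item));       // failed again; defer to the next pass
    }
  }
}

int main() {
  std::deque<int> q{ 1, 2, 3 };
  RetryOnce(q, [](int v) { return v % 2 == 0; });   // odd values stay queued
  std::cout << q.size() << " requests still pending\n";  // prints 2
}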
+ if(!pending_context.async) { + pending_context->caller_callback(pending_context->caller_context, result); + } + } +} + +template +template +inline OperationStatus FasterKv::InternalRead(C& pending_context) const { + typedef C pending_read_context_t; + + if(thread_ctx().phase != Phase::REST) { + const_cast(this)->HeavyEnter(); + } + + const key_t& key = pending_context.key(); + KeyHash hash = key.GetHash(); + const AtomicHashBucketEntry* atomic_entry = FindEntry(hash); + if(!atomic_entry) { + // no record found + return OperationStatus::NOT_FOUND; + } + + HashBucketEntry entry = atomic_entry->load(); + Address address = entry.address(); + Address begin_address = hlog.begin_address.load(); + Address head_address = hlog.head_address.load(); + Address safe_read_only_address = hlog.safe_read_only_address.load(); + Address read_only_address = hlog.read_only_address.load(); + uint64_t latest_record_version = 0; + + if(address >= head_address) { + // Look through the in-memory portion of the log, to find the first record (if any) whose key + // matches. + const record_t* record = reinterpret_cast(hlog.Get(address)); + latest_record_version = record->header.checkpoint_version; + if(key != record->key()) { + address = TraceBackForKeyMatch(key, record->header.previous_address(), head_address); + } + } + + switch(thread_ctx().phase) { + case Phase::PREPARE: + // Reading old version (v). + if(latest_record_version > thread_ctx().version) { + // CPR shift detected: we are in the "PREPARE" phase, and a record has a version later than + // what we've seen. + pending_context.go_async(thread_ctx().phase, thread_ctx().version, address, entry); + return OperationStatus::CPR_SHIFT_DETECTED; + } + break; + default: + break; + } + + if(address >= safe_read_only_address) { + // Mutable or fuzzy region + // concurrent read + pending_context.GetAtomic(hlog.Get(address)); + return OperationStatus::SUCCESS; + } else if(address >= head_address) { + // Immutable region + // single-thread read + pending_context.Get(hlog.Get(address)); + return OperationStatus::SUCCESS; + } else if(address >= begin_address) { + // Record not available in-memory + pending_context.go_async(thread_ctx().phase, thread_ctx().version, address, entry); + return OperationStatus::RECORD_ON_DISK; + } else { + // No record found + return OperationStatus::NOT_FOUND; + } +} + +template +template +inline OperationStatus FasterKv::InternalUpsert(C& pending_context) { + typedef C pending_upsert_context_t; + + if(thread_ctx().phase != Phase::REST) { + HeavyEnter(); + } + + const key_t& key = pending_context.key(); + KeyHash hash = key.GetHash(); + HashBucketEntry expected_entry; + HashBucket* bucket; + AtomicHashBucketEntry* atomic_entry = FindOrCreateEntry(hash, expected_entry, bucket); + + // (Note that address will be Address::kInvalidAddress, if the atomic_entry was created.) + Address address = expected_entry.address(); + Address head_address = hlog.head_address.load(); + Address read_only_address = hlog.read_only_address.load(); + uint64_t latest_record_version = 0; + + if(address >= head_address) { + // Multiple keys may share the same hash. Try to find the most recent record with a matching + // key that we might be able to update in place. 
+ record_t* record = reinterpret_cast(hlog.Get(address)); + latest_record_version = record->header.checkpoint_version; + if(key != record->key()) { + address = TraceBackForKeyMatch(key, record->header.previous_address(), head_address); + } + } + + CheckpointLockGuard lock_guard{ checkpoint_locks_, hash }; + + // The common case + if(thread_ctx().phase == Phase::REST && address >= read_only_address) { + record_t* record = reinterpret_cast(hlog.Get(address)); + if(pending_context.PutAtomic(record)) { + return OperationStatus::SUCCESS; + } else { + // Must retry as RCU. + goto create_record; + } + } + + // Acquire necessary locks. + switch(thread_ctx().phase) { + case Phase::PREPARE: + // Working on old version (v). + if(!lock_guard.try_lock_old()) { + pending_context.go_async(thread_ctx().phase, thread_ctx().version, address, expected_entry); + return OperationStatus::CPR_SHIFT_DETECTED; + } else { + if(latest_record_version > thread_ctx().version) { + // CPR shift detected: we are in the "PREPARE" phase, and a record has a version later than + // what we've seen. + pending_context.go_async(thread_ctx().phase, thread_ctx().version, address, + expected_entry); + return OperationStatus::CPR_SHIFT_DETECTED; + } + } + break; + case Phase::IN_PROGRESS: + // All other threads are in phase {PREPARE,IN_PROGRESS,WAIT_PENDING}. + if(latest_record_version < thread_ctx().version) { + // Will create new record or update existing record to new version (v+1). + if(!lock_guard.try_lock_new()) { + pending_context.go_async(thread_ctx().phase, thread_ctx().version, address, + expected_entry); + return OperationStatus::RETRY_LATER; + } else { + // Update to new version (v+1) requires RCU. + goto create_record; + } + } + break; + case Phase::WAIT_PENDING: + // All other threads are in phase {IN_PROGRESS,WAIT_PENDING,WAIT_FLUSH}. + if(latest_record_version < thread_ctx().version) { + if(lock_guard.old_locked()) { + pending_context.go_async(thread_ctx().phase, thread_ctx().version, address, + expected_entry); + return OperationStatus::RETRY_LATER; + } else { + // Update to new version (v+1) requires RCU. + goto create_record; + } + } + break; + case Phase::WAIT_FLUSH: + // All other threads are in phase {WAIT_PENDING,WAIT_FLUSH,PERSISTENCE_CALLBACK}. + if(latest_record_version < thread_ctx().version) { + goto create_record; + } + break; + default: + break; + } + + if(address >= read_only_address) { + // Mutable region; try to update in place. + if(atomic_entry->load() != expected_entry) { + // Some other thread may have RCUed the record before we locked it; try again. + return OperationStatus::RETRY_NOW; + } + // We acquired the necessary locks, so so we can update the record's bucket atomically. + record_t* record = reinterpret_cast(hlog.Get(address)); + if(pending_context.PutAtomic(record)) { + // Host successfully replaced record, atomically. + return OperationStatus::SUCCESS; + } else { + // Must retry as RCU. + goto create_record; + } + } + + // Create a record and attempt RCU. 
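The PREPARE/IN_PROGRESS/WAIT_PENDING arms above coordinate through per-hash checkpoint locks: try_lock_old succeeds only while no thread holds a new-version lock on that key, and try_lock_new only while no old-version lock is held. The sketch below is one plausible way to build such a lock from two reference counts packed into a 64-bit word; the real implementation lives in checkpoint_locks.h and its layout may differ.

// Hedged sketch of an old/new checkpoint lock (field widths are assumptions).
#include <atomic>
#include <cstdint>

class CheckpointLockSketch {
 public:
  // Lock for the old version (v): allowed only while no new-version holders exist.
  bool try_lock_old() {
    uint64_t expected = word_.load();
    while(new_count(expected) == 0) {
      if(word_.compare_exchange_weak(expected, expected + 1)) return true;
    }
    return false;
  }
  // Lock for the new version (v+1): allowed only while no old-version holders exist.
  bool try_lock_new() {
    uint64_t expected = word_.load();
    while(old_count(expected) == 0) {
      if(word_.compare_exchange_weak(expected, expected + (1ull << 32))) return true;
    }
    return false;
  }
  void unlock_old() { word_ -= 1; }
  void unlock_new() { word_ -= (1ull << 32); }
  bool old_locked() const { return old_count(word_.load()) > 0; }

 private:
  static uint32_t old_count(uint64_t w) { return static_cast<uint32_t>(w & 0xFFFFFFFFull); }
  static uint32_t new_count(uint64_t w) { return static_cast<uint32_t>(w >> 32); }
  std::atomic<uint64_t> word_{ 0 };
};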
+create_record: + uint32_t record_size = record_t::size(key, pending_context.value_size()); + Address new_address = BlockAllocate(record_size); + record_t* record = reinterpret_cast(hlog.Get(new_address)); + new(record) record_t{ + RecordInfo{ + static_cast(thread_ctx().version), true, false, false, + expected_entry.address() }, + key }; + pending_context.Put(record); + + HashBucketEntry updated_entry{ new_address, hash.tag(), false }; + + if(atomic_entry->compare_exchange_strong(expected_entry, updated_entry)) { + // Installed the new record in the hash table. + return OperationStatus::SUCCESS; + } else { + // Try again. + record->header.invalid = true; + return InternalUpsert(pending_context); + } +} + +template +template +inline OperationStatus FasterKv::InternalRmw(C& pending_context, bool retrying) { + typedef C pending_rmw_context_t; + + Phase phase = retrying ? pending_context.phase : thread_ctx().phase; + uint32_t version = retrying ? pending_context.version : thread_ctx().version; + + if(phase != Phase::REST) { + HeavyEnter(); + } + + const key_t& key = pending_context.key(); + KeyHash hash = key.GetHash(); + HashBucketEntry expected_entry; + HashBucket* bucket; + AtomicHashBucketEntry* atomic_entry = FindOrCreateEntry(hash, expected_entry, bucket); + + // (Note that address will be Address::kInvalidAddress, if the atomic_entry was created.) + Address address = expected_entry.address(); + Address begin_address = hlog.begin_address.load(); + Address head_address = hlog.head_address.load(); + Address read_only_address = hlog.read_only_address.load(); + Address safe_read_only_address = hlog.safe_read_only_address.load(); + uint64_t latest_record_version = 0; + + if(address >= head_address) { + // Multiple keys may share the same hash. Try to find the most recent record with a matching + // key that we might be able to update in place. + record_t* record = reinterpret_cast(hlog.Get(address)); + latest_record_version = record->header.checkpoint_version; + if(key != record->key()) { + address = TraceBackForKeyMatch(key, record->header.previous_address(), head_address); + } + } + + CheckpointLockGuard lock_guard{ checkpoint_locks_, hash }; + + // The common case. + if(phase == Phase::REST && address >= read_only_address) { + record_t* record = reinterpret_cast(hlog.Get(address)); + if(pending_context.RmwAtomic(record)) { + // In-place RMW succeeded. + return OperationStatus::SUCCESS; + } else { + // Must retry as RCU. + goto create_record; + } + } + + // Acquire necessary locks. + switch(phase) { + case Phase::PREPARE: + // Working on old version (v). + if(!lock_guard.try_lock_old()) { + // If we're retrying the operation, then we already have an old lock, so we'll always + // succeed in obtaining a second. Otherwise, another thread has acquired the new lock, so + // a CPR shift has occurred. + assert(!retrying); + pending_context.go_async(phase, version, address, expected_entry); + return OperationStatus::CPR_SHIFT_DETECTED; + } else { + if(latest_record_version > version) { + // CPR shift detected: we are in the "PREPARE" phase, and a mutable record has a version + // later than what we've seen. + assert(!retrying); + pending_context.go_async(phase, version, address, expected_entry); + return OperationStatus::CPR_SHIFT_DETECTED; + } + } + break; + case Phase::IN_PROGRESS: + // All other threads are in phase {PREPARE,IN_PROGRESS,WAIT_PENDING}. + if(latest_record_version < version) { + // Will create new record or update existing record to new version (v+1). 
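Both InternalUpsert above and InternalRmw fall back to the same copy-on-write path: allocate a new record at the log's tail, link it back to the address read from the hash entry, and publish it with a compare-and-swap on that entry, marking the new record invalid if the CAS loses. A simplified sketch of that publish step (the vector stands in for the hybrid log; the real Allocate() is a thread-safe tail bump, and the types here are stand-ins):

// Copy-on-write publish: append, back-link, then CAS the hash slot.
#include <atomic>
#include <cstdint>
#include <vector>

struct RecordSketch {
  uint64_t previous_address;  // back-pointer to the prior version in the log
  bool invalid;               // set when a racing CAS wins and this copy is abandoned
  int value;
};

std::vector<RecordSketch> log_sketch;   // stand-in for the hybrid log's in-memory tail

bool UpsertRcu(std::atomic<uint64_t>& slot, uint64_t expected_address, int new_value) {
  // Allocate at the tail and fill in the new version of the record.
  uint64_t new_address = log_sketch.size();
  log_sketch.push_back(RecordSketch{ expected_address, false, new_value });

  // Publish: the slot must still point at the version we copied from.
  if(slot.compare_exchange_strong(expected_address, new_address)) {
    return true;                        // the new record is now the head of the chain
  }
  log_sketch[new_address].invalid = true;  // lost the race; mark our copy dead and let the caller retry
  return false;
}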
+ if(!lock_guard.try_lock_new()) { + if(!retrying) { + pending_context.go_async(phase, version, address, expected_entry); + } else { + pending_context.continue_async(address, expected_entry); + } + return OperationStatus::RETRY_LATER; + } else { + // Update to new version (v+1) requires RCU. + goto create_record; + } + } + break; + case Phase::WAIT_PENDING: + // All other threads are in phase {IN_PROGRESS,WAIT_PENDING,WAIT_FLUSH}. + if(latest_record_version < version) { + if(lock_guard.old_locked()) { + if(!retrying) { + pending_context.go_async(phase, version, address, expected_entry); + } else { + pending_context.continue_async(address, expected_entry); + } + return OperationStatus::RETRY_LATER; + } else { + // Update to new version (v+1) requires RCU. + goto create_record; + } + } + break; + case Phase::WAIT_FLUSH: + // All other threads are in phase {WAIT_PENDING,WAIT_FLUSH,PERSISTENCE_CALLBACK}. + if(latest_record_version < version) { + goto create_record; + } + break; + default: + break; + } + + if(address >= read_only_address) { + // Mutable region. Try to update in place. + if(atomic_entry->load() != expected_entry) { + // Some other thread may have RCUed the record before we locked it; try again. + return OperationStatus::RETRY_NOW; + } + // We acquired the necessary locks, so so we can update the record's bucket atomically. + record_t* record = reinterpret_cast(hlog.Get(address)); + if(pending_context.RmwAtomic(record)) { + // In-place RMW succeeded. + return OperationStatus::SUCCESS; + } else { + // Must retry as RCU. + goto create_record; + } + } else if(address >= safe_read_only_address) { + // Fuzzy Region: Must go pending due to lost-update anomaly + if(!retrying) { + pending_context.go_async(phase, version, address, expected_entry); + } else { + pending_context.continue_async(address, expected_entry); + } + return OperationStatus::RETRY_LATER; + } else if(address >= head_address) { + goto create_record; + } else if(address >= begin_address) { + // Need to obtain old record from disk. + if(!retrying) { + pending_context.go_async(phase, version, address, expected_entry); + } else { + pending_context.continue_async(address, expected_entry); + } + return OperationStatus::RECORD_ON_DISK; + } else { + // Create a new record. + goto create_record; + } + + // Create a record and attempt RCU. +create_record: + uint32_t record_size = record_t::size(key, pending_context.value_size()); + Address new_address = BlockAllocate(record_size); + record_t* new_record = reinterpret_cast(hlog.Get(new_address)); + + // Allocating a block may have the side effect of advancing the head address. + head_address = hlog.head_address.load(); + // Allocating a block may have the side effect of advancing the thread context's version and + // phase. + if(!retrying) { + phase = thread_ctx().phase; + version = thread_ctx().version; + } + + new(new_record) record_t{ + RecordInfo{ + static_cast(version), true, false, false, + expected_entry.address() }, + key }; + if(address < hlog.begin_address.load()) { + pending_context.RmwInitial(new_record); + } else if(address >= head_address) { + const record_t* old_record = reinterpret_cast(hlog.Get(address)); + pending_context.RmwCopy(old_record, new_record); + } else { + // The block we allocated for the new record caused the head address to advance beyond + // the old record. Need to obtain the old record from disk. 
+ new_record->header.invalid = true; + if(!retrying) { + pending_context.go_async(phase, version, address, expected_entry); + } else { + pending_context.continue_async(address, expected_entry); + } + return OperationStatus::RECORD_ON_DISK; + } + + HashBucketEntry updated_entry{ new_address, hash.tag(), false }; + if(atomic_entry->compare_exchange_strong(expected_entry, updated_entry)) { + return OperationStatus::SUCCESS; + } else { + // CAS failed; try again. + new_record->header.invalid = true; + if(!retrying) { + pending_context.go_async(phase, version, address, expected_entry); + } else { + pending_context.continue_async(address, expected_entry); + } + return OperationStatus::RETRY_NOW; + } +} + +template +inline OperationStatus FasterKv::InternalRetryPendingRmw( + async_pending_rmw_context_t& pending_context) { + OperationStatus status = InternalRmw(pending_context, true); + if(status == OperationStatus::SUCCESS && pending_context.version != thread_ctx().version) { + status = OperationStatus::SUCCESS_UNMARK; + } + return status; +} + +template +inline Address FasterKv::TraceBackForKeyMatch(const key_t& key, Address from_address, + Address min_offset) const { + while(from_address >= min_offset) { + const record_t* record = reinterpret_cast(hlog.Get(from_address)); + if(key == record->key()) { + return from_address; + } else { + from_address = record->header.previous_address(); + continue; + } + } + return from_address; +} + +template +inline Status FasterKv::HandleOperationStatus(ExecutionContext& ctx, + pending_context_t& pending_context, OperationStatus internal_status, bool& async) { + async = false; + switch(internal_status) { + case OperationStatus::RETRY_NOW: + switch(pending_context.type) { + case OperationType::Read: { + async_pending_read_context_t& read_context = + *static_cast(&pending_context); + internal_status = InternalRead(read_context); + break; + } + case OperationType::Upsert: { + async_pending_upsert_context_t& upsert_context = + *static_cast(&pending_context); + internal_status = InternalUpsert(upsert_context); + break; + } + case OperationType::RMW: { + async_pending_rmw_context_t& rmw_context = + *static_cast(&pending_context); + internal_status = InternalRmw(rmw_context, false); + break; + } + } + + if(internal_status == OperationStatus::SUCCESS) { + return Status::Ok; + } else { + return HandleOperationStatus(ctx, pending_context, internal_status, async); + } + case OperationStatus::RETRY_LATER: + if(thread_ctx().phase == Phase::PREPARE) { + assert(pending_context.type == OperationType::RMW); + // Can I be marking an operation again and again? + if(!checkpoint_locks_.get_lock(pending_context.key().GetHash()).try_lock_old()) { + return PivotAndRetry(ctx, pending_context, async); + } + } + return RetryLater(ctx, pending_context, async); + case OperationStatus::RECORD_ON_DISK: + if(thread_ctx().phase == Phase::PREPARE) { + assert(pending_context.type == OperationType::Read || + pending_context.type == OperationType::RMW); + // Can I be marking an operation again and again? 
+ if(!checkpoint_locks_.get_lock(pending_context.key().GetHash()).try_lock_old()) { + return PivotAndRetry(ctx, pending_context, async); + } + } + return IssueAsyncIoRequest(ctx, pending_context, async); + case OperationStatus::SUCCESS_UNMARK: + checkpoint_locks_.get_lock(pending_context.key().GetHash()).unlock_old(); + return Status::Ok; + case OperationStatus::NOT_FOUND_UNMARK: + checkpoint_locks_.get_lock(pending_context.key().GetHash()).unlock_old(); + return Status::NotFound; + case OperationStatus::CPR_SHIFT_DETECTED: + return PivotAndRetry(ctx, pending_context, async); + } + // not reached + assert(false); + return Status::Corruption; +} + +template +inline Status FasterKv::PivotAndRetry(ExecutionContext& ctx, + pending_context_t& pending_context, bool& async) { + // Some invariants + assert(ctx.version == thread_ctx().version); + assert(thread_ctx().phase == Phase::PREPARE); + Refresh(); + // thread must have moved to IN_PROGRESS phase + assert(thread_ctx().version == ctx.version + 1); + // retry with new contexts + pending_context.phase = thread_ctx().phase; + pending_context.version = thread_ctx().version; + return HandleOperationStatus(thread_ctx(), pending_context, OperationStatus::RETRY_NOW, async); +} + +template +inline Status FasterKv::RetryLater(ExecutionContext& ctx, + pending_context_t& pending_context, bool& async) { + IAsyncContext* context_copy; + Status result = pending_context.DeepCopy(context_copy); + if(result == Status::Ok) { + async = true; + ctx.retry_requests.push_back(context_copy); + return Status::Pending; + } else { + async = false; + return result; + } +} + +template +inline constexpr uint32_t FasterKv::MinIoRequestSize() const { + return static_cast( + sizeof(value_t) + pad_alignment(record_t::min_disk_key_size(), + alignof(value_t))); +} + +template +inline Status FasterKv::IssueAsyncIoRequest(ExecutionContext& ctx, + pending_context_t& pending_context, bool& async) { + // Issue asynchronous I/O request + uint64_t io_id = thread_ctx().io_id++; + thread_ctx().pending_ios.insert({ io_id, pending_context.key().GetHash() }); + async = true; + AsyncIOContext io_request{ this, pending_context.address, &pending_context, + &thread_ctx().io_responses, io_id }; + AsyncGetFromDisk(pending_context.address, MinIoRequestSize(), AsyncGetFromDiskCallback, + io_request); + return Status::Pending; +} + +template +inline Address FasterKv::BlockAllocate(uint32_t record_size) { + uint32_t page; + Address retval = hlog.Allocate(record_size, page); + while(retval < hlog.read_only_address.load()) { + Refresh(); + // Don't overrun the hlog's tail offset. + bool page_closed = (retval == Address::kInvalidAddress); + while(page_closed) { + page_closed = !hlog.NewPage(page); + Refresh(); + } + retval = hlog.Allocate(record_size, page); + } + return retval; +} + +template +void FasterKv::AsyncGetFromDisk(Address address, uint32_t num_records, + AsyncIOCallback callback, AsyncIOContext& context) { + if(epoch_.IsProtected()) { + /// Throttling. (Thread pool, unprotected threads are not throttled.) + while(num_pending_ios.load() > 120) { + disk.TryComplete(); + std::this_thread::yield(); + epoch_.ProtectAndDrain(); + } + } + ++num_pending_ios; + hlog.AsyncGetFromDisk(address, num_records, callback, context); +} + +template +void FasterKv::AsyncGetFromDiskCallback(IAsyncContext* ctxt, Status result, + size_t bytes_transferred) { + CallbackContext context{ ctxt }; + faster_t* faster = reinterpret_cast(context->faster); + /// Context stack is: AsyncIOContext, PendingContext. 
+ pending_context_t* pending_context = static_cast(context->caller_context); + + /// This I/O is finished. + --faster->num_pending_ios; + /// Always "goes async": context is freed by the issuing thread, when processing thread I/O + /// responses. + context.async = true; + + pending_context->result = result; + if(result == Status::Ok) { + record_t* record = reinterpret_cast(context->record.GetValidPointer()); + // Size of the record we read from disk (might not have read the entire record, yet). + size_t record_size = context->record.available_bytes; + if(record->min_disk_key_size() > record_size) { + // Haven't read the full record in yet; I/O is not complete! + faster->AsyncGetFromDisk(context->address, record->min_disk_key_size(), + AsyncGetFromDiskCallback, *context.get()); + context.async = true; + } else if(record->min_disk_value_size() > record_size) { + // Haven't read the full record in yet; I/O is not complete! + faster->AsyncGetFromDisk(context->address, record->min_disk_value_size(), + AsyncGetFromDiskCallback, *context.get()); + context.async = true; + } else if(record->disk_size() > record_size) { + // Haven't read the full record in yet; I/O is not complete! + faster->AsyncGetFromDisk(context->address, record->disk_size(), + AsyncGetFromDiskCallback, *context.get()); + context.async = true; + } else if(pending_context->key() == record->key()) { + //The keys are same, so I/O is complete + context->thread_io_responses->push(context.get()); + } else { + //keys are not same. I/O is not complete + context->address = record->header.previous_address(); + if(context->address >= faster->hlog.begin_address.load()) { + faster->AsyncGetFromDisk(context->address, faster->MinIoRequestSize(), + AsyncGetFromDiskCallback, *context.get()); + context.async = true; + } else { + // Record not found, so I/O is complete. + context->thread_io_responses->push(context.get()); + } + } + } +} + +template +OperationStatus FasterKv::InternalContinuePendingRead(ExecutionContext& context, + AsyncIOContext& io_context) { + if(io_context.address >= hlog.begin_address.load()) { + async_pending_read_context_t* pending_context = static_cast( + io_context.caller_context); + record_t* record = reinterpret_cast(io_context.record.GetValidPointer()); + pending_context->Get(record); + assert(!kCopyReadsToTail); + return (thread_ctx().version > context.version) ? OperationStatus::SUCCESS_UNMARK : + OperationStatus::SUCCESS; + } else { + return (thread_ctx().version > context.version) ? OperationStatus::NOT_FOUND_UNMARK : + OperationStatus::NOT_FOUND; + } +} + +template +OperationStatus FasterKv::InternalContinuePendingRmw(ExecutionContext& context, + AsyncIOContext& io_context) { + async_pending_rmw_context_t* pending_context = static_cast( + io_context.caller_context); + + // Find a hash bucket entry to store the updated value in. + const key_t& key = pending_context->key(); + KeyHash hash = key.GetHash(); + HashBucketEntry expected_entry; + HashBucket* bucket; + AtomicHashBucketEntry* atomic_entry = FindOrCreateEntry(hash, expected_entry, bucket); + + // (Note that address will be Address::kInvalidAddress, if the atomic_entry was created.) + Address address = expected_entry.address(); + Address head_address = hlog.head_address.load(); + + // Make sure that atomic_entry is OK to update. 
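AsyncGetFromDiskCallback above may run before the whole record is in memory, so it re-issues the read with progressively better size estimates (minimum key size, then minimum value size, then the full disk size) until the record is complete. A synchronous, error-handling-free sketch of that "read the header, then re-read at the true size" idea, with a fake ReadAt device and illustrative on-disk layout:

// Grow the read until the whole record has been fetched.
#include <cstdint>
#include <cstring>
#include <vector>

struct DiskRecordHeaderSketch {
  uint32_t key_size;
  uint32_t value_size;
};

// Pretend device: copies up to `length` bytes of whatever is stored at `address`.
size_t ReadAt(const std::vector<uint8_t>& device, uint64_t address,
              uint8_t* buffer, size_t length) {
  size_t available = device.size() - address;       // assumes a valid, header-complete address
  size_t n = length < available ? length : available;
  std::memcpy(buffer, device.data() + address, n);
  return n;
}

std::vector<uint8_t> ReadWholeRecord(const std::vector<uint8_t>& device, uint64_t address) {
  std::vector<uint8_t> buffer(sizeof(DiskRecordHeaderSketch));
  size_t have = ReadAt(device, address, buffer.data(), buffer.size());  // minimal first read
  DiskRecordHeaderSketch header;
  std::memcpy(&header, buffer.data(), sizeof(header));
  size_t full_size = sizeof(header) + header.key_size + header.value_size;
  if(have < full_size) {
    buffer.resize(full_size);
    ReadAt(device, address, buffer.data(), full_size);  // re-issue with the true record size
  }
  return buffer;
}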
+ if(address >= head_address) { + record_t* record = reinterpret_cast(hlog.Get(address)); + if(key != record->key()) { + address = TraceBackForKeyMatch(key, record->header.previous_address(), head_address); + } + } + + if(address > pending_context->entry.address()) { + // We can't trace the current hash bucket entry back to the record we read. + pending_context->continue_async(address, expected_entry); + return OperationStatus::RETRY_NOW; + } + assert(address < hlog.begin_address.load() || address == pending_context->entry.address()); + + // We have to do copy-on-write/RCU and write the updated value to the tail of the log. + uint32_t record_size = record_t::size(key, pending_context->value_size()); + Address new_address = BlockAllocate(record_size); + record_t* new_record = reinterpret_cast(hlog.Get(new_address)); + + new(new_record) record_t{ + RecordInfo{ + static_cast(context.version), true, false, false, + expected_entry.address() }, + key }; + if(io_context.address < hlog.begin_address.load()) { + // The on-disk trace back failed to find a key match. + pending_context->RmwInitial(new_record); + } else { + // The record we read from disk. + const record_t* disk_record = reinterpret_cast( + io_context.record.GetValidPointer()); + pending_context->RmwCopy(disk_record, new_record); + } + + HashBucketEntry updated_entry{ new_address, hash.tag(), false }; + if(atomic_entry->compare_exchange_strong(expected_entry, updated_entry)) { + assert(thread_ctx().version >= context.version); + return (thread_ctx().version == context.version) ? OperationStatus::SUCCESS : + OperationStatus::SUCCESS_UNMARK; + } else { + // CAS failed; try again. + new_record->header.invalid = true; + pending_context->continue_async(address, expected_entry); + return OperationStatus::RETRY_NOW; + } +} + +template +void FasterKv::InitializeCheckpointLocks() { + uint32_t table_version = resize_info_.version; + uint64_t size = state_[table_version].size(); + checkpoint_locks_.Initialize(size); +} + +template +Status FasterKv::WriteIndexMetadata() { + uint32_t checkpoint_version = checkpoint_.index_metadata.version; + std::string filename = disk.index_checkpoint_path(checkpoint_version) + "info.dat"; + // (This code will need to be refactored into the disk_t interface, if we want to support + // unformatted disks.) + std::FILE* file = std::fopen(filename.c_str(), "wb"); + if(!file) { + return Status::IOError; + } + if(std::fwrite(&checkpoint_.index_metadata, sizeof(checkpoint_.index_metadata), 1, file) != 1) { + std::fclose(file); + return Status::IOError; + } + if(std::fclose(file) != 0) { + return Status::IOError; + } + return Status::Ok; +} + +template +Status FasterKv::ReadIndexMetadata(uint32_t version) { + std::string filename = disk.index_checkpoint_path(version) + "info.dat"; + // (This code will need to be refactored into the disk_t interface, if we want to support + // unformatted disks.) 
+ std::FILE* file = std::fopen(filename.c_str(), "rb"); + if(!file) { + return Status::IOError; + } + if(std::fread(&checkpoint_.index_metadata, sizeof(checkpoint_.index_metadata), 1, file) != 1) { + std::fclose(file); + return Status::IOError; + } + if(std::fclose(file) != 0) { + return Status::IOError; + } + return Status::Ok; +} + +template +Status FasterKv::WriteCprMetadata() { + uint32_t checkpoint_version = checkpoint_.log_metadata.version; + std::string filename = disk.cpr_checkpoint_path(checkpoint_version) + "info.dat"; + // (This code will need to be refactored into the disk_t interface, if we want to support + // unformatted disks.) + std::FILE* file = std::fopen(filename.c_str(), "wb"); + if(!file) { + return Status::IOError; + } + if(std::fwrite(&checkpoint_.log_metadata, sizeof(checkpoint_.log_metadata), 1, file) != 1) { + std::fclose(file); + return Status::IOError; + } + if(std::fclose(file) != 0) { + return Status::IOError; + } + return Status::Ok; +} + +template +Status FasterKv::ReadCprMetadata(uint32_t version) { + std::string filename = disk.cpr_checkpoint_path(version) + "info.dat"; + // (This code will need to be refactored into the disk_t interface, if we want to support + // unformatted disks.) + std::FILE* file = std::fopen(filename.c_str(), "rb"); + if(!file) { + return Status::IOError; + } + if(std::fread(&checkpoint_.log_metadata, sizeof(checkpoint_.log_metadata), 1, file) != 1) { + std::fclose(file); + return Status::IOError; + } + if(std::fclose(file) != 0) { + return Status::IOError; + } + return Status::Ok; +} + +template +Status FasterKv::WriteCprContext() { + uint32_t checkpoint_version = prev_thread_ctx().version; + std::string filename = disk.cpr_checkpoint_path(checkpoint_version); + const Guid& guid = prev_thread_ctx().guid; + filename += guid.ToString(); + filename += ".dat"; + // (This code will need to be refactored into the disk_t interface, if we want to support + // unformatted disks.) + std::FILE* file = std::fopen(filename.c_str(), "wb"); + if(!file) { + return Status::IOError; + } + if(std::fwrite(static_cast(&prev_thread_ctx()), + sizeof(PersistentExecContext), 1, file) != 1) { + std::fclose(file); + return Status::IOError; + } + if(std::fclose(file) != 0) { + return Status::IOError; + } + return Status::Ok; +} + +template +Status FasterKv::ReadCprContexts(uint32_t version, const Guid* guids) { + for(size_t idx = 0; idx < Thread::kMaxNumThreads; ++idx) { + const Guid& guid = guids[idx]; + if(guid == Guid{}) { + continue; + } + std::string filename = disk.cpr_checkpoint_path(version); + filename += guid.ToString(); + filename += ".dat"; + // (This code will need to be refactored into the disk_t interface, if we want to support + // unformatted disks.) + std::FILE* file = std::fopen(filename.c_str(), "rb"); + if(!file) { + return Status::IOError; + } + PersistentExecContext context{}; + if(std::fread(&context, sizeof(PersistentExecContext), 1, file) != 1) { + std::fclose(file); + return Status::IOError; + } + if(std::fclose(file) != 0) { + return Status::IOError; + } + auto result = checkpoint_.continue_tokens.insert({ context.guid, context.serial_num }); + assert(result.second); + } + if(checkpoint_.continue_tokens.size() != checkpoint_.log_metadata.num_threads) { + return Status::Corruption; + } else { + return Status::Ok; + } +} + +template +Status FasterKv::CheckpointFuzzyIndex() { + uint32_t hash_table_version = resize_info_.version; + uint32_t checkpoint_version = checkpoint_.index_metadata.version; + // Checkpoint the main hash table. 
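The metadata routines above all follow the same pattern: a trivially copyable struct is written with a single fwrite and read back with a single fread, and every fopen/fwrite/fread/fclose return value is checked. A standalone round-trip of that pattern (the struct fields and the "info.dat" name here are illustrative):

// Round-trip a POD metadata struct through a file, checking every call.
#include <cstdint>
#include <cstdio>

struct IndexMetadataSketch {
  uint32_t version;
  uint64_t table_size;
  uint64_t num_ht_bytes;
};

bool WriteMetadata(const char* path, const IndexMetadataSketch& m) {
  std::FILE* file = std::fopen(path, "wb");
  if(!file) return false;
  bool ok = std::fwrite(&m, sizeof(m), 1, file) == 1;
  ok = (std::fclose(file) == 0) && ok;
  return ok;
}

bool ReadMetadata(const char* path, IndexMetadataSketch& m) {
  std::FILE* file = std::fopen(path, "rb");
  if(!file) return false;
  bool ok = std::fread(&m, sizeof(m), 1, file) == 1;
  ok = (std::fclose(file) == 0) && ok;
  return ok;
}

int main() {
  IndexMetadataSketch out{ 1, 1ull << 20, (1ull << 20) * 64 }, in{};
  if(!WriteMetadata("info.dat", out) || !ReadMetadata("info.dat", in)) return 1;
  return (in.version == out.version && in.num_ht_bytes == out.num_ht_bytes) ? 0 : 1;
}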
+ file_t ht_file = disk.NewFile(disk.relative_index_checkpoint_path(checkpoint_version) + + "ht.dat"); + RETURN_NOT_OK(ht_file.Open(&disk.handler())); + RETURN_NOT_OK(state_[hash_table_version].Checkpoint(disk, std::move(ht_file), + checkpoint_.index_metadata.num_ht_bytes)); + // Checkpoint the hash table's overflow buckets. + file_t ofb_file = disk.NewFile(disk.relative_index_checkpoint_path(checkpoint_version) + + "ofb.dat"); + RETURN_NOT_OK(ofb_file.Open(&disk.handler())); + RETURN_NOT_OK(overflow_buckets_allocator_[hash_table_version].Checkpoint(disk, + std::move(ofb_file), checkpoint_.index_metadata.num_ofb_bytes)); + checkpoint_.index_checkpoint_started = true; + return Status::Ok; +} + +template +Status FasterKv::CheckpointFuzzyIndexComplete() { + if(!checkpoint_.index_checkpoint_started) { + return Status::Pending; + } + uint32_t hash_table_version = resize_info_.version; + Status result = state_[hash_table_version].CheckpointComplete(false); + if(result == Status::Pending) { + return Status::Pending; + } else if(result != Status::Ok) { + return result; + } else { + return overflow_buckets_allocator_[hash_table_version].CheckpointComplete(false); + } +} + +template +Status FasterKv::RecoverFuzzyIndex() { + uint8_t hash_table_version = resize_info_.version; + uint32_t checkpoint_version = checkpoint_.index_metadata.version; + assert(state_[hash_table_version].size() == checkpoint_.index_metadata.table_size); + + // Recover the main hash table. + file_t ht_file = disk.NewFile(disk.relative_index_checkpoint_path(checkpoint_version) + + "ht.dat"); + RETURN_NOT_OK(ht_file.Open(&disk.handler())); + RETURN_NOT_OK(state_[hash_table_version].Recover(disk, std::move(ht_file), + checkpoint_.index_metadata.num_ht_bytes)); + // Recover the hash table's overflow buckets. + file_t ofb_file = disk.NewFile(disk.relative_index_checkpoint_path(checkpoint_version) + + "ofb.dat"); + RETURN_NOT_OK(ofb_file.Open(&disk.handler())); + return overflow_buckets_allocator_[hash_table_version].Recover(disk, std::move(ofb_file), + checkpoint_.index_metadata.num_ofb_bytes, checkpoint_.index_metadata.ofb_count); +} + +template +Status FasterKv::RecoverFuzzyIndexComplete(bool wait) { + uint8_t hash_table_version = resize_info_.version; + Status result = state_[hash_table_version].RecoverComplete(true); + if(result != Status::Ok) { + return result; + } + result = overflow_buckets_allocator_[hash_table_version].RecoverComplete(true); + if(result != Status::Ok) { + return result; + } + + // Clear all tentative entries. + for(uint64_t bucket_idx = 0; bucket_idx < state_[hash_table_version].size(); ++bucket_idx) { + HashBucket* bucket = &state_[hash_table_version].bucket(bucket_idx); + while(true) { + for(uint32_t entry_idx = 0; entry_idx < HashBucket::kNumEntries; ++entry_idx) { + if(bucket->entries[entry_idx].load().tentative()) { + bucket->entries[entry_idx].store(HashBucketEntry::kInvalidEntry); + } + } + // Go to next bucket in the chain + HashBucketOverflowEntry entry = bucket->overflow_entry.load(); + if(entry.unused()) { + // No more buckets in the chain. 
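The checkpoint and recovery routines lean on RETURN_NOT_OK, whose definition is not part of this hunk; presumably it is an early-return macro along these lines (the Status enum and the calling function below are stand-ins, and the exact definition may differ):

// Hedged sketch of an early-return-on-error macro in the RETURN_NOT_OK style.
enum class StatusSketch { Ok, IOError };

#define RETURN_NOT_OK_SKETCH(expr)              \
  do {                                          \
    StatusSketch _s = (expr);                   \
    if(_s != StatusSketch::Ok) return _s;       \
  } while(0)

StatusSketch OpenAndCheckpoint(StatusSketch open_result, StatusSketch checkpoint_result) {
  RETURN_NOT_OK_SKETCH(open_result);            // propagate the first failure
  RETURN_NOT_OK_SKETCH(checkpoint_result);
  return StatusSketch::Ok;
}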
+ break; + } + bucket = &overflow_buckets_allocator_[hash_table_version].Get(entry.address()); + assert(reinterpret_cast(bucket) % Constants::kCacheLineBytes == 0); + } + } + return Status::Ok; +} + +template +Status FasterKv::RecoverHybridLog() { + class Context : public IAsyncContext { + public: + Context(hlog_t& hlog_, uint32_t page_, RecoveryStatus& recovery_status_) + : hlog{ &hlog_} + , page{ page_ } + , recovery_status{ &recovery_status_ } { + } + /// The deep-copy constructor + Context(const Context& other) + : hlog{ other.hlog } + , page{ other.page } + , recovery_status{ other.recovery_status } { + } + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + public: + hlog_t* hlog; + uint32_t page; + RecoveryStatus* recovery_status; + }; + + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + result = context->hlog->AsyncReadPagesFromLog(context->page, 1, *context->recovery_status); + }; + + Address from_address = checkpoint_.index_metadata.checkpoint_start_address; + Address to_address = checkpoint_.log_metadata.final_address; + + uint32_t start_page = from_address.page(); + uint32_t end_page = to_address.offset() > 0 ? to_address.page() + 1 : to_address.page(); + uint32_t capacity = hlog.buffer_size(); + RecoveryStatus recovery_status{ start_page, end_page }; + // Initially issue read request for all pages that can be held in memory + uint32_t total_pages_to_read = end_page - start_page; + uint32_t pages_to_read_first = std::min(capacity, total_pages_to_read); + RETURN_NOT_OK(hlog.AsyncReadPagesFromLog(start_page, pages_to_read_first, recovery_status)); + + for(uint32_t page = start_page; page < end_page; ++page) { + while(recovery_status.page_status(page) != PageRecoveryStatus::ReadDone) { + disk.TryComplete(); + std::this_thread::sleep_for(10ms); + } + + // handle start and end at non-page boundaries + RETURN_NOT_OK(RecoverFromPage(page == start_page ? from_address : Address{ page, 0 }, + page + 1 == end_page ? 
to_address : + Address{ page, Address::kMaxOffset })); + + // OS thread flushes current page and issues a read request if necessary + if(page + capacity < end_page) { + Context context{ hlog, page + capacity, recovery_status }; + RETURN_NOT_OK(hlog.AsyncFlushPage(page, recovery_status, callback, &context)); + } else { + RETURN_NOT_OK(hlog.AsyncFlushPage(page, recovery_status, nullptr, nullptr)); + } + } + // Wait until all pages have been flushed + for(uint32_t page = start_page; page < end_page; ++page) { + while(recovery_status.page_status(page) != PageRecoveryStatus::FlushDone) { + disk.TryComplete(); + std::this_thread::sleep_for(10ms); + } + } + return Status::Ok; +} + +template +Status FasterKv::RecoverHybridLogFromSnapshotFile() { + class Context : public IAsyncContext { + public: + Context(hlog_t& hlog_, file_t& file_, uint32_t file_start_page_, uint32_t page_, + RecoveryStatus& recovery_status_) + : hlog{ &hlog_ } + , file{ &file_ } + , file_start_page{ file_start_page_ } + , page{ page_ } + , recovery_status{ &recovery_status_ } { + } + /// The deep-copy constructor + Context(const Context& other) + : hlog{ other.hlog } + , file{ other.file } + , file_start_page{ other.file_start_page } + , page{ other.page } + , recovery_status{ other.recovery_status } { + } + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + public: + hlog_t* hlog; + file_t* file; + uint32_t file_start_page; + uint32_t page; + RecoveryStatus* recovery_status; + }; + + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + result = context->hlog->AsyncReadPagesFromSnapshot(*context->file, + context->file_start_page, context->page, 1, *context->recovery_status); + }; + + Address file_start_address = checkpoint_.log_metadata.flushed_address; + Address from_address = checkpoint_.index_metadata.checkpoint_start_address; + Address to_address = checkpoint_.log_metadata.final_address; + + uint32_t start_page = file_start_address.page(); + uint32_t end_page = to_address.offset() > 0 ? to_address.page() + 1 : to_address.page(); + uint32_t capacity = hlog.buffer_size(); + RecoveryStatus recovery_status{ start_page, end_page }; + checkpoint_.snapshot_file = disk.NewFile(disk.relative_cpr_checkpoint_path( + checkpoint_.log_metadata.version) + "snapshot.dat"); + RETURN_NOT_OK(checkpoint_.snapshot_file.Open(&disk.handler())); + + // Initially issue read request for all pages that can be held in memory + uint32_t total_pages_to_read = end_page - start_page; + uint32_t pages_to_read_first = std::min(capacity, total_pages_to_read); + RETURN_NOT_OK(hlog.AsyncReadPagesFromSnapshot(checkpoint_.snapshot_file, start_page, start_page, + pages_to_read_first, recovery_status)); + + for(uint32_t page = start_page; page < end_page; ++page) { + while(recovery_status.page_status(page) != PageRecoveryStatus::ReadDone) { + disk.TryComplete(); + std::this_thread::sleep_for(10ms); + } + + // Perform recovery if page in fuzzy portion of the log + if(Address{ page + 1, 0 } > from_address) { + // handle start and end at non-page boundaries + RETURN_NOT_OK(RecoverFromPage(page == from_address.page() ? from_address : + Address{ page, 0 }, + page + 1 == end_page ? 
to_address : + Address{ page, Address::kMaxOffset })); + } + + // OS thread flushes current page and issues a read request if necessary + if(page + capacity < end_page) { + Context context{ hlog, checkpoint_.snapshot_file, start_page, page + capacity, + recovery_status }; + RETURN_NOT_OK(hlog.AsyncFlushPage(page, recovery_status, callback, &context)); + } else { + RETURN_NOT_OK(hlog.AsyncFlushPage(page, recovery_status, nullptr, nullptr)); + } + } + // Wait until all pages have been flushed + for(uint32_t page = start_page; page < end_page; ++page) { + while(recovery_status.page_status(page) != PageRecoveryStatus::FlushDone) { + disk.TryComplete(); + std::this_thread::sleep_for(10ms); + } + } + return Status::Ok; +} + +template +Status FasterKv::RecoverFromPage(Address from_address, Address to_address) { + assert(from_address.page() == to_address.page()); + for(Address address = from_address; address < to_address;) { + record_t* record = reinterpret_cast(hlog.Get(address)); + if(record->header.IsNull()) { + address += sizeof(record->header); + continue; + } + if(record->header.invalid) { + address += record->size(); + continue; + } + const key_t& key = record->key(); + KeyHash hash = key.GetHash(); + HashBucketEntry expected_entry; + HashBucket* bucket; + AtomicHashBucketEntry* atomic_entry = FindOrCreateEntry(hash, expected_entry, bucket); + + if(record->header.checkpoint_version <= checkpoint_.log_metadata.version) { + HashBucketEntry new_entry{ address, hash.tag(), false }; + atomic_entry->store(new_entry); + } else { + record->header.invalid = true; + if(record->header.previous_address() < checkpoint_.index_metadata.checkpoint_start_address) { + HashBucketEntry new_entry{ record->header.previous_address(), hash.tag(), false }; + atomic_entry->store(new_entry); + } + } + address += record->size(); + } + + return Status::Ok; +} + +template +Status FasterKv::RestoreHybridLog() { + Address tail_address = checkpoint_.log_metadata.final_address; + uint32_t end_page = tail_address.offset() > 0 ? tail_address.page() + 1 : tail_address.page(); + uint32_t capacity = hlog.buffer_size(); + // Restore as much of the log as will fit in memory. + uint32_t start_page; + if(end_page < capacity - hlog.kNumHeadPages) { + start_page = 0; + } else { + start_page = end_page - (capacity - hlog.kNumHeadPages); + } + RecoveryStatus recovery_status{ start_page, end_page }; + + uint32_t num_pages = end_page - start_page; + RETURN_NOT_OK(hlog.AsyncReadPagesFromLog(start_page, num_pages, recovery_status)); + + // Wait until all pages have been read. + for(uint32_t page = start_page; page < end_page; ++page) { + while(recovery_status.page_status(page) != PageRecoveryStatus::ReadDone) { + disk.TryComplete(); + std::this_thread::sleep_for(10ms); + } + } + // Skip the null page. + Address head_address = start_page == 0 ? 
Address{ 0, Constants::kCacheLineBytes } : + Address{ start_page, 0 }; + hlog.RecoveryReset(checkpoint_.index_metadata.log_begin_address, head_address, tail_address); + return Status::Ok; +} + +template +void FasterKv::HeavyEnter() { + if(thread_ctx().phase == Phase::GC_IO_PENDING || thread_ctx().phase == Phase::GC_IN_PROGRESS) { + CleanHashTableBuckets(); + return; + } + while(thread_ctx().phase == Phase::GROW_PREPARE) { + // We spin-wait as a simplification + // Could instead do a "heavy operation" here + std::this_thread::yield(); + Refresh(); + } + if(thread_ctx().phase == Phase::GROW_IN_PROGRESS) { + SplitHashTableBuckets(); + } +} + +template +bool FasterKv::CleanHashTableBuckets() { + uint64_t chunk = gc_.next_chunk++; + if(chunk >= gc_.num_chunks) { + // No chunk left to clean. + return false; + } + uint8_t version = resize_info_.version; + Address begin_address = hlog.begin_address.load(); + uint64_t upper_bound; + if(chunk + 1 < grow_.num_chunks) { + // All chunks but the last chunk contain kGrowHashTableChunkSize elements. + upper_bound = kGrowHashTableChunkSize; + } else { + // Last chunk might contain more or fewer elements. + upper_bound = state_[version].size() - (chunk * kGcHashTableChunkSize); + } + for(uint64_t idx = 0; idx < upper_bound; ++idx) { + HashBucket* bucket = &state_[version].bucket(chunk * kGcHashTableChunkSize + idx); + while(true) { + for(uint32_t entry_idx = 0; entry_idx < HashBucket::kNumEntries; ++entry_idx) { + AtomicHashBucketEntry& atomic_entry = bucket->entries[entry_idx]; + HashBucketEntry expected_entry = atomic_entry.load(); + if(!expected_entry.unused() && expected_entry.address() != Address::kInvalidAddress && + expected_entry.address() < begin_address) { + // The record that this entry points to was truncated; try to delete the entry. + atomic_entry.compare_exchange_strong(expected_entry, HashBucketEntry::kInvalidEntry); + // If deletion failed, then some other thread must have added a new record to the entry. + } + } + // Go to next bucket in the chain. + HashBucketOverflowEntry overflow_entry = bucket->overflow_entry.load(); + if(overflow_entry.unused()) { + // No more buckets in the chain. + break; + } + bucket = &overflow_buckets_allocator_[version].Get(overflow_entry.address()); + } + } + // Done with this chunk--did some work. + return true; +} + +template +void FasterKv::AddHashEntry(HashBucket*& bucket, uint32_t& next_idx, uint8_t version, + HashBucketEntry entry) { + if(next_idx == HashBucket::kNumEntries) { + // Need to allocate a new bucket, first. + FixedPageAddress new_bucket_addr = overflow_buckets_allocator_[version].Allocate(); + HashBucketOverflowEntry new_bucket_entry{ new_bucket_addr }; + bucket->overflow_entry.store(new_bucket_entry); + bucket = &overflow_buckets_allocator_[version].Get(new_bucket_addr); + next_idx = 0; + } + bucket->entries[next_idx].store(entry); + ++next_idx; +} + +template +Address FasterKv::TraceBackForOtherChainStart(uint64_t old_size, uint64_t new_size, + Address from_address, Address min_address, uint8_t side) { + assert(side == 0 || side == 1); + // Search back as far as min_address. + while(from_address >= min_address) { + const record_t* record = reinterpret_cast(hlog.Get(from_address)); + KeyHash hash = record->key().GetHash(); + if((hash.idx(new_size) < old_size) != (side == 0)) { + // Record's key hashes to the other side. 
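CleanHashTableBuckets above distributes garbage collection across threads by handing out fixed-size chunks of the bucket array through an atomic counter and CAS-deleting entries whose address has fallen below the log's begin address; note that its last-chunk sizing appears to mix gc_ with grow_.num_chunks and kGrowHashTableChunkSize. The sketch below shows the same chunked scan over a flat array of 64-bit entries with a single chunk-size constant; the entry encoding is a stand-in.

// Chunked, CAS-based cleanup of stale hash entries.
#include <algorithm>
#include <atomic>
#include <cstdint>
#include <vector>

constexpr uint64_t kChunkSizeSketch = 16384;

struct GcStateSketch {
  std::atomic<uint64_t> next_chunk{ 0 };
  uint64_t num_chunks = 0;
};

// Returns false when there are no chunks left to clean.
bool CleanChunk(std::vector<std::atomic<uint64_t>>& entries,   // entry value == log address
                GcStateSketch& gc, uint64_t begin_address) {
  uint64_t chunk = gc.next_chunk++;                 // claim a unit of work
  if(chunk >= gc.num_chunks) return false;
  uint64_t start = chunk * kChunkSizeSketch;
  uint64_t end = std::min<uint64_t>(start + kChunkSizeSketch, entries.size());
  for(uint64_t idx = start; idx < end; ++idx) {
    uint64_t expected = entries[idx].load();
    if(expected != 0 && expected < begin_address) {
      // The entry points at a truncated record; try to drop it. If the CAS fails,
      // another thread installed a newer record and the entry must stay.
      entries[idx].compare_exchange_strong(expected, 0);
    }
  }
  return true;                                      // did some work
}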
+ return from_address; + } + from_address = record->header.previous_address(); + } + return from_address; +} + +template +void FasterKv::SplitHashTableBuckets() { + // This thread won't exit until all hash table buckets have been split. + Address head_address = hlog.head_address.load(); + Address begin_address = hlog.begin_address.load(); + for(uint64_t chunk = grow_.next_chunk++; chunk < grow_.num_chunks; chunk = grow_.next_chunk++) { + uint64_t old_size = state_[grow_.old_version].size(); + uint64_t new_size = state_[grow_.new_version].size(); + assert(new_size == old_size * 2); + // Split this chunk. + uint64_t upper_bound; + if(chunk + 1 < grow_.num_chunks) { + // All chunks but the last chunk contain kGrowHashTableChunkSize elements. + upper_bound = kGrowHashTableChunkSize; + } else { + // Last chunk might contain more or fewer elements. + upper_bound = old_size - (chunk * kGrowHashTableChunkSize); + } + for(uint64_t idx = 0; idx < upper_bound; ++idx) { + + // Split this (chain of) bucket(s). + HashBucket* old_bucket = &state_[grow_.old_version].bucket( + chunk * kGrowHashTableChunkSize + idx); + HashBucket* new_bucket0 = &state_[grow_.new_version].bucket( + chunk * kGrowHashTableChunkSize + idx); + HashBucket* new_bucket1 = &state_[grow_.new_version].bucket( + old_size + chunk * kGrowHashTableChunkSize + idx); + uint32_t new_entry_idx0 = 0; + uint32_t new_entry_idx1 = 0; + while(true) { + for(uint32_t old_entry_idx = 0; old_entry_idx < HashBucket::kNumEntries; ++old_entry_idx) { + HashBucketEntry old_entry = old_bucket->entries[old_entry_idx].load(); + if(old_entry.unused()) { + // Nothing to do. + continue; + } else if(old_entry.address() < head_address) { + // Can't tell which new bucket the entry should go into; put it in both. + AddHashEntry(new_bucket0, new_entry_idx0, grow_.new_version, old_entry); + AddHashEntry(new_bucket1, new_entry_idx1, grow_.new_version, old_entry); + continue; + } + + const record_t* record = reinterpret_cast(hlog.Get( + old_entry.address())); + KeyHash hash = record->key().GetHash(); + if(hash.idx(new_size) < old_size) { + // Record's key hashes to the 0 side of the new hash table. + AddHashEntry(new_bucket0, new_entry_idx0, grow_.new_version, old_entry); + Address other_address = TraceBackForOtherChainStart(old_size, new_size, + record->header.previous_address(), head_address, 0); + if(other_address >= begin_address) { + // We found a record that either is on disk or has a key that hashes to the 1 side of + // the new hash table. + AddHashEntry(new_bucket1, new_entry_idx1, grow_.new_version, + HashBucketEntry{ other_address, old_entry.tag(), false }); + } + } else { + // Record's key hashes to the 1 side of the new hash table. + AddHashEntry(new_bucket1, new_entry_idx1, grow_.new_version, old_entry); + Address other_address = TraceBackForOtherChainStart(old_size, new_size, + record->header.previous_address(), head_address, 1); + if(other_address >= begin_address) { + // We found a record that either is on disk or has a key that hashes to the 0 side of + // the new hash table. + AddHashEntry(new_bucket0, new_entry_idx0, grow_.new_version, + HashBucketEntry{ other_address, old_entry.tag(), false }); + } + } + } + // Go to next bucket in the chain. + HashBucketOverflowEntry overflow_entry = old_bucket->overflow_entry.load(); + if(overflow_entry.unused()) { + // No more buckets in the chain. + break; + } + old_bucket = &overflow_buckets_allocator_[grow_.old_version].Get(overflow_entry.address()); + } + } + // Done with this chunk. 
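SplitHashTableBuckets above relies on the fact that doubling a power-of-two table splits each old bucket i into new buckets i and i + old_size, with the side chosen by the one hash bit the larger mask newly exposes; that is what the hash.idx(new_size) < old_size test decides. Assuming idx() is the usual mask of the low-order bits, the small program below checks that equivalence:

// Verify: idx(new_size) < old_size iff the newly-exposed hash bit is zero,
// and the low-order bits (the old bucket index) are preserved by the split.
#include <cstdint>
#include <cstdio>

uint64_t idx(uint64_t hash, uint64_t table_size) {   // table_size is a power of two
  return hash & (table_size - 1);
}

int main() {
  const uint64_t old_size = 1ull << 20;
  const uint64_t new_size = old_size * 2;
  for(uint64_t hash = 0; hash < (1ull << 22); hash += 7919) {
    bool side0 = idx(hash, new_size) < old_size;          // test used by the split
    bool bit_clear = (hash & old_size) == 0;              // the newly-exposed hash bit
    bool same_low_bits = idx(hash, new_size) % old_size == idx(hash, old_size);
    if(side0 != bit_clear || !same_low_bits) {
      std::printf("mismatch at %llu\n", (unsigned long long)hash);
      return 1;
    }
  }
  std::printf("split rule verified\n");
  return 0;
}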
+ if(--grow_.num_pending_chunks == 0) { + // Free the old hash table. + state_[grow_.old_version].Uninitialize(); + overflow_buckets_allocator_[grow_.old_version].Uninitialize(); + break; + } + } + // Thread has finished growing its part of the hash table. + thread_ctx().phase = Phase::REST; + // Thread ack that it has finished growing the hash table. + if(epoch_.FinishThreadPhase(Phase::GROW_IN_PROGRESS)) { + // Let other threads know that they can use the new hash table now. + GlobalMoveToNextState(SystemState{ Action::GrowIndex, Phase::GROW_IN_PROGRESS, + thread_ctx().version }); + } else { + while(system_state_.load().phase == Phase::GROW_IN_PROGRESS) { + // Spin until all other threads have finished splitting their chunks. + std::this_thread::yield(); + } + } +} + +template +bool FasterKv::GlobalMoveToNextState(SystemState current_state) { + SystemState next_state = current_state.GetNextState(); + if(!system_state_.compare_exchange_strong(current_state, next_state)) { + return false; + } + + switch(next_state.action) { + case Action::Checkpoint: + switch(next_state.phase) { + case Phase::PREP_INDEX_CHKPT: + // This case is handled directly inside Checkpoint(). + assert(false); + break; + case Phase::INDEX_CHKPT: + // Issue async request for fuzzy checkpoint + assert(!checkpoint_.failed); + if(CheckpointFuzzyIndex() != Status::Ok) { + checkpoint_.failed = true; + } + break; + case Phase::PREPARE: + // INDEX_CHKPT -> PREPARE + // Get an overestimate for the ofb's tail, after we've finished fuzzy-checkpointing the ofb. + // (Ensures that recovery won't accidentally reallocate from the ofb.) + checkpoint_.index_metadata.ofb_count = + overflow_buckets_allocator_[resize_info_.version].count(); + // Write index meta data on disk + if(WriteIndexMetadata() != Status::Ok) { + checkpoint_.failed = true; + } + break; + case Phase::IN_PROGRESS: { + // PREPARE -> IN_PROGRESS + // Do nothing + break; + } + case Phase::WAIT_PENDING: + // IN_PROGRESS -> WAIT_PENDING + // Do nothing + break; + case Phase::WAIT_FLUSH: { + // WAIT_PENDING -> WAIT_FLUSH + if(fold_over_snapshot) { + // Move read-only to tail + Address tail_address = hlog.ShiftReadOnlyToTail(); + // Get final address for CPR + checkpoint_.log_metadata.final_address = tail_address; + } else { + Address tail_address = hlog.GetTailAddress(); + // Get final address for CPR + checkpoint_.log_metadata.final_address = tail_address; + checkpoint_.snapshot_file = disk.NewFile(disk.relative_cpr_checkpoint_path( + checkpoint_.log_metadata.version) + "snapshot.dat"); + if(checkpoint_.snapshot_file.Open(&disk.handler()) != Status::Ok) { + checkpoint_.failed = true; + } + // Flush the log to a snapshot. + hlog.AsyncFlushPagesToFile(checkpoint_.log_metadata.flushed_address.page(), + checkpoint_.log_metadata.final_address, checkpoint_.snapshot_file, + checkpoint_.flush_pending); + } + // Write CPR meta data file + if(WriteCprMetadata() != Status::Ok) { + checkpoint_.failed = true; + } + break; + } + case Phase::PERSISTENCE_CALLBACK: + // WAIT_FLUSH -> PERSISTENCE_CALLBACK + break; + case Phase::REST: + // PERSISTENCE_CALLBACK -> REST + // All persistence callbacks have been called; we can reset the contexts now. (Have to reset + // contexts before another checkpoint can be started.) + checkpoint_.CheckpointDone(); + // Free checkpoint locks! + checkpoint_locks_.Free(); + // Checkpoint is done--no more work for threads to do. 
+ system_state_.store(SystemState{ Action::None, Phase::REST, next_state.version }); + break; + default: + // not reached + assert(false); + break; + } + break; + case Action::GC: + switch(next_state.phase) { + case Phase::GC_IO_PENDING: + // This case is handled directly inside ShiftBeginAddress(). + assert(false); + break; + case Phase::GC_IN_PROGRESS: + // GC_IO_PENDING -> GC_IN_PROGRESS + // Tell the disk to truncate the log. + hlog.Truncate(gc_.truncate_callback); + break; + case Phase::REST: + // GC_IN_PROGRESS -> REST + // GC is done--no more work for threads to do. + if(gc_.complete_callback) { + gc_.complete_callback(); + } + system_state_.store(SystemState{ Action::None, Phase::REST, next_state.version }); + break; + default: + // not reached + assert(false); + break; + } + break; + case Action::GrowIndex: + switch(next_state.phase) { + case Phase::GROW_PREPARE: + // This case is handled directly inside GrowIndex(). + assert(false); + break; + case Phase::GROW_IN_PROGRESS: + // Swap hash table versions so that all threads will use the new version after populating it. + resize_info_.version = grow_.new_version; + break; + case Phase::REST: + if(grow_.callback) { + grow_.callback(state_[grow_.new_version].size()); + } + system_state_.store(SystemState{ Action::None, Phase::REST, next_state.version }); + break; + default: + // not reached + assert(false); + break; + } + break; + default: + // not reached + assert(false); + break; + } + return true; +} + +template +void FasterKv::MarkAllPendingRequests() { + uint32_t table_version = resize_info_.version; + uint64_t table_size = state_[table_version].size(); + + for(const IAsyncContext* ctxt : thread_ctx().retry_requests) { + const pending_context_t* context = static_cast(ctxt); + // We will succeed, since no other thread can currently advance the entry's version, since this + // thread hasn't acked "PENDING" phase completion yet. + bool result = checkpoint_locks_.get_lock(context->key().GetHash()).try_lock_old(); + assert(result); + } + for(const auto& pending_io : thread_ctx().pending_ios) { + // We will succeed, since no other thread can currently advance the entry's version, since this + // thread hasn't acked "PENDING" phase completion yet. + bool result = checkpoint_locks_.get_lock(pending_io.second).try_lock_old(); + assert(result); + } +} + +template +void FasterKv::HandleSpecialPhases() { + SystemState final_state = system_state_.load(); + if(final_state.phase == Phase::REST) { + // Nothing to do; just reset thread context. + thread_ctx().phase = Phase::REST; + thread_ctx().version = final_state.version; + return; + } + SystemState previous_state{ final_state.action, thread_ctx().phase, thread_ctx().version }; + do { + // Identify the transition (currentState -> nextState) + SystemState current_state = (previous_state == final_state) ? final_state : + previous_state.GetNextState(); + switch(current_state.action) { + case Action::Checkpoint: + switch(current_state.phase) { + case Phase::PREP_INDEX_CHKPT: + // Both from REST -> PREP_INDEX_CHKPT and PREP_INDEX_CHKPT -> PREP_INDEX_CHKPT + if(previous_state.phase == Phase::REST) { + // Thread ack that we're performing a checkpoint. 
+ if(epoch_.FinishThreadPhase(Phase::PREP_INDEX_CHKPT)) { + GlobalMoveToNextState(current_state); + } + } + break; + case Phase::INDEX_CHKPT: { + // Both from PREP_INDEX_CHKPT -> INDEX_CHKPT and INDEX_CHKPT -> INDEX_CHKPT + Status result = CheckpointFuzzyIndexComplete(); + if(result != Status::Pending && result != Status::Ok) { + checkpoint_.failed = true; + } + if(result != Status::Pending) { + GlobalMoveToNextState(current_state); + } + break; + } + case Phase::PREPARE: + // Handle INDEX_CHKPT -> PREPARE and PREPARE -> PREPARE + if(previous_state.phase == Phase::INDEX_CHKPT) { + // mark pending requests + MarkAllPendingRequests(); + // keep a count of number of threads + ++checkpoint_.log_metadata.num_threads; + // set the thread index + checkpoint_.log_metadata.guids[Thread::id()] = thread_ctx().guid; + // Thread ack that it has finished marking its pending requests. + if(epoch_.FinishThreadPhase(Phase::PREPARE)) { + GlobalMoveToNextState(current_state); + } + } + break; + case Phase::IN_PROGRESS: + // Handle PREPARE -> IN_PROGRESS and IN_PROGRESS -> IN_PROGRESS + if(previous_state.phase == Phase::PREPARE) { + assert(prev_thread_ctx().retry_requests.empty()); + assert(prev_thread_ctx().pending_ios.empty()); + assert(prev_thread_ctx().io_responses.empty()); + + // Get a new thread context; keep track of the old one as "previous." + thread_contexts_[Thread::id()].swap(); + // initialize a new local context + thread_ctx().Initialize(Phase::IN_PROGRESS, current_state.version, + prev_thread_ctx().guid, prev_thread_ctx().serial_num); + // Thread ack that it has swapped contexts. + if(epoch_.FinishThreadPhase(Phase::IN_PROGRESS)) { + GlobalMoveToNextState(current_state); + } + } + break; + case Phase::WAIT_PENDING: + // Handle IN_PROGRESS -> WAIT_PENDING and WAIT_PENDING -> WAIT_PENDING + if(!epoch_.HasThreadFinishedPhase(Phase::WAIT_PENDING)) { + if(prev_thread_ctx().pending_ios.empty() && + prev_thread_ctx().retry_requests.empty()) { + // Thread ack that it has completed its pending I/Os. + if(epoch_.FinishThreadPhase(Phase::WAIT_PENDING)) { + GlobalMoveToNextState(current_state); + } + } + } + break; + case Phase::WAIT_FLUSH: + // Handle WAIT_PENDING -> WAIT_FLUSH and WAIT_FLUSH -> WAIT_FLUSH + if(!epoch_.HasThreadFinishedPhase(Phase::WAIT_FLUSH)) { + bool flushed; + if(fold_over_snapshot) { + flushed = hlog.flushed_until_address.load() >= checkpoint_.log_metadata.final_address; + } else { + flushed = checkpoint_.flush_pending.load() == 0; + } + if(flushed) { + // write context info + WriteCprContext(); + // Thread ack that it has written its CPU context. + if(epoch_.FinishThreadPhase(Phase::WAIT_FLUSH)) { + GlobalMoveToNextState(current_state); + } + } + } + break; + case Phase::PERSISTENCE_CALLBACK: + // Handle WAIT_FLUSH -> PERSISTENCE_CALLBACK and PERSISTENCE_CALLBACK -> PERSISTENCE_CALLBACK + if(previous_state.phase == Phase::WAIT_FLUSH) { + // Persistence callback + if(checkpoint_.persistence_callback) { + checkpoint_.persistence_callback(prev_thread_ctx().serial_num); + } + // Thread has finished checkpointing. + thread_ctx().phase = Phase::REST; + // Thread ack that it has finished checkpointing. + if(epoch_.FinishThreadPhase(Phase::PERSISTENCE_CALLBACK)) { + GlobalMoveToNextState(current_state); + } + } + break; + default: + // nothing to do. + break; + } + break; + case Action::GC: + switch(current_state.phase) { + case Phase::GC_IO_PENDING: + // Handle REST -> GC_IO_PENDING and GC_IO_PENDING -> GC_IO_PENDING. 
+ if(previous_state.phase == Phase::REST) { + assert(prev_thread_ctx().retry_requests.empty()); + assert(prev_thread_ctx().pending_ios.empty()); + assert(prev_thread_ctx().io_responses.empty()); + // Get a new thread context; keep track of the old one as "previous." + thread_contexts_[Thread::id()].swap(); + // initialize a new local context + thread_ctx().Initialize(Phase::GC_IO_PENDING, current_state.version, + prev_thread_ctx().guid, prev_thread_ctx().serial_num); + } + + // See if the old thread context has completed its pending I/Os. + if(!epoch_.HasThreadFinishedPhase(Phase::GC_IO_PENDING)) { + if(prev_thread_ctx().pending_ios.empty() && + prev_thread_ctx().retry_requests.empty()) { + // Thread ack that it has completed its pending I/Os. + if(epoch_.FinishThreadPhase(Phase::GC_IO_PENDING)) { + GlobalMoveToNextState(current_state); + } + } + } + break; + case Phase::GC_IN_PROGRESS: + // Handle GC_IO_PENDING -> GC_IN_PROGRESS and GC_IN_PROGRESS -> GC_IN_PROGRESS. + if(!epoch_.HasThreadFinishedPhase(Phase::GC_IN_PROGRESS)) { + if(!CleanHashTableBuckets()) { + // No more buckets for this thread to clean; thread has finished GC. + thread_ctx().phase = Phase::REST; + // Thread ack that it has finished GC. + if(epoch_.FinishThreadPhase(Phase::GC_IN_PROGRESS)) { + GlobalMoveToNextState(current_state); + } + } + } + break; + default: + assert(false); // not reached + break; + } + break; + case Action::GrowIndex: + switch(current_state.phase) { + case Phase::GROW_PREPARE: + if(previous_state.phase == Phase::REST) { + // Thread ack that we're going to grow the hash table. + if(epoch_.FinishThreadPhase(Phase::GROW_PREPARE)) { + GlobalMoveToNextState(current_state); + } + } else { + // Wait for all other threads to finish their outstanding (synchronous) hash table + // operations. + std::this_thread::yield(); + } + break; + case Phase::GROW_IN_PROGRESS: + SplitHashTableBuckets(); + break; + } + break; + } + thread_ctx().phase = current_state.phase; + thread_ctx().version = current_state.version; + previous_state = current_state; + } while(previous_state != final_state); +} + +template +bool FasterKv::Checkpoint(void(*persistence_callback)(uint64_t persistent_serial_num)) { + // Only one thread can initiate a checkpoint at a time. (This assumption is implicit in the C# + /// version, and explicit here.) + SystemState expected{ Action::None, Phase::REST, system_state_.load().version }; + SystemState desired{ Action::Checkpoint, Phase::REST, expected.version }; + if(!system_state_.compare_exchange_strong(expected, desired)) { + // Can't start a new checkpoint while a checkpoint or recovery is already in progress. + return false; + } + // We are going to start a checkpoint. + epoch_.ResetPhaseFinished(); + // Initialize all contexts + disk.CreateIndexCheckpointDirectory(desired.version); + disk.CreateCprCheckpointDirectory(desired.version); + // Obtain tail address for fuzzy index checkpoint + if(!fold_over_snapshot) { + checkpoint_.InitializeCheckpoint(desired.version, state_[resize_info_.version].size(), + hlog.begin_address.load(), hlog.GetTailAddress(), true, + hlog.flushed_until_address.load(), persistence_callback); + } else { + checkpoint_.InitializeCheckpoint(desired.version, state_[resize_info_.version].size(), + hlog.begin_address.load(), hlog.GetTailAddress(), false, + Address::kInvalidAddress, persistence_callback); + } + InitializeCheckpointLocks(); + // Let other threads know that the checkpoint has started. 
+ system_state_.store(desired.GetNextState()); + return true; +} + +template +Status FasterKv::Recover(uint32_t cpr_version, uint32_t index_version, + std::vector& session_ids) { + session_ids.clear(); + SystemState expected = SystemState{ Action::None, Phase::REST, system_state_.load().version }; + if(!system_state_.compare_exchange_strong(expected, + SystemState{ Action::Recover, Phase::REST, expected.version })) { + return Status::Aborted; + } + checkpoint_.InitializeRecover(); + Status status; +#define BREAK_NOT_OK(s) \ + status = (s); \ + if (status != Status::Ok) break \ + + do { + // Index and log metadata. + BREAK_NOT_OK(ReadIndexMetadata(index_version)); + BREAK_NOT_OK(ReadCprMetadata(cpr_version)); + system_state_.store(SystemState{ Action::Recover, Phase::REST, cpr_version + 1 }); + + BREAK_NOT_OK(ReadCprContexts(cpr_version, checkpoint_.log_metadata.guids)); + // The index itself (including overflow buckets). + BREAK_NOT_OK(RecoverFuzzyIndex()); + BREAK_NOT_OK(RecoverFuzzyIndexComplete(true)); + // Any changes made to the log while the index was being fuzzy-checkpointed. + if(fold_over_snapshot) { + BREAK_NOT_OK(RecoverHybridLog()); + } else { + BREAK_NOT_OK(RecoverHybridLogFromSnapshotFile()); + } + BREAK_NOT_OK(RestoreHybridLog()); + } while(false); + if(status == Status::Ok) { + for(const auto& token : checkpoint_.continue_tokens) { + session_ids.push_back(token.first); + } + } + checkpoint_.RecoverDone(); + system_state_.store(SystemState{ Action::None, Phase::REST, cpr_version + 1 }); + return status; +#undef BREAK_NOT_OK +} + +template +bool FasterKv::ShiftBeginAddress(Address address, + GcState::truncate_callback_t truncate_callback, + GcState::complete_callback_t complete_callback) { + SystemState expected = SystemState{ Action::None, Phase::REST, system_state_.load().version }; + if(!system_state_.compare_exchange_strong(expected, + SystemState{ Action::GC, Phase::REST, expected.version })) { + // Can't start a GC while an action is already in progress. + return false; + } + hlog.begin_address.store(address); + // Each active thread will notify the epoch when all pending I/Os have completed. + epoch_.ResetPhaseFinished(); + uint64_t num_chunks = std::max(state_[resize_info_.version].size() / kGcHashTableChunkSize, + (uint64_t)1); + gc_.Initialize(truncate_callback, complete_callback, num_chunks); + // Let other threads know to complete their pending I/Os, so that the log can be truncated. + system_state_.store(SystemState{ Action::GC, Phase::GC_IO_PENDING, expected.version }); + return true; +} + +template +bool FasterKv::GrowIndex(GrowState::callback_t caller_callback) { + SystemState expected = SystemState{ Action::None, Phase::REST, system_state_.load().version }; + if(!system_state_.compare_exchange_strong(expected, + SystemState{ Action::GrowIndex, Phase::REST, expected.version })) { + // An action is already in progress. + return false; + } + epoch_.ResetPhaseFinished(); + uint8_t current_version = resize_info_.version; + assert(current_version == 0 || current_version == 1); + uint8_t next_version = 1 - current_version; + uint64_t num_chunks = std::max(state_[current_version].size() / kGrowHashTableChunkSize, + (uint64_t)1); + grow_.Initialize(caller_callback, current_version, num_chunks); + // Initialize the next version of our hash table to be twice the size of the current version. 
+ state_[next_version].Initialize(state_[current_version].size() * 2, disk.log().alignment()); + overflow_buckets_allocator_[next_version].Initialize(disk.log().alignment(), epoch_); + + SystemState next = SystemState{ Action::GrowIndex, Phase::GROW_PREPARE, expected.version }; + system_state_.store(next); + + // Let this thread know it should be growing the index. + Refresh(); + return true; +} + +} +} // namespace FASTER::core \ No newline at end of file diff --git a/cc/src/core/gc_state.h b/cc/src/core/gc_state.h new file mode 100644 index 000000000..4b7a74251 --- /dev/null +++ b/cc/src/core/gc_state.h @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include + +namespace FASTER { +namespace core { + +/// State of the active garbage-collection call. +class GcState { + public: + typedef void(*truncate_callback_t)(uint64_t offset); + typedef void(*complete_callback_t)(void); + + GcState() + : truncate_callback{ nullptr } + , complete_callback{ nullptr } + , num_chunks{ 0 } + , next_chunk{ 0 } { + } + + void Initialize(truncate_callback_t truncate_callback_, complete_callback_t complete_callback_, + uint64_t num_chunks_) { + truncate_callback = truncate_callback_; + complete_callback = complete_callback_; + num_chunks = num_chunks_; + next_chunk = 0; + } + + truncate_callback_t truncate_callback; + complete_callback_t complete_callback; + uint64_t num_chunks; + std::atomic next_chunk; +}; + +} +} // namespace FASTER::core diff --git a/cc/src/core/grow_state.h b/cc/src/core/grow_state.h new file mode 100644 index 000000000..fd64860c3 --- /dev/null +++ b/cc/src/core/grow_state.h @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include + +namespace FASTER { +namespace core { + +/// State of the active grow-index call. +class GrowState { + public: + typedef void(*callback_t)(uint64_t new_size); + + GrowState() + : callback{ nullptr } + , num_pending_chunks{ 0 } + , old_version{ UINT8_MAX } + , new_version{ UINT8_MAX } { + } + + void Initialize(callback_t callback_, uint8_t current_version, uint64_t num_chunks_) { + callback = callback_; + assert(current_version == 0 || current_version == 1); + old_version = current_version; + new_version = 1 - current_version; + num_chunks = num_chunks_; + num_pending_chunks = num_chunks_; + next_chunk = 0; + } + + callback_t callback; + uint8_t old_version; + uint8_t new_version; + uint64_t num_chunks; + std::atomic num_pending_chunks; + std::atomic next_chunk; +}; + +} +} // namespace FASTER::core diff --git a/cc/src/core/guid.h b/cc/src/core/guid.h new file mode 100644 index 000000000..a41f5818f --- /dev/null +++ b/cc/src/core/guid.h @@ -0,0 +1,142 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#define NOMINMAX +#define _WINSOCKAPI_ +#include +#else +#include +#endif + +namespace FASTER { +namespace core { + +/// Wrapper for GUIDs, for Windows and Linux. 
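A minimal usage sketch for the Guid wrapper declared below; the include path and the example function are illustrative, not part of this commit, but the calls it exercises (Create, ToString, operator==, std::hash) are the ones the class defines:

#include <functional>
#include <string>
#include "core/guid.h"   // assumed include path

void guid_usage_sketch() {
  FASTER::core::Guid session_id = FASTER::core::Guid::Create();   // CoCreateGuid / uuid_generate
  std::string text = session_id.ToString();                       // canonical xxxxxxxx-xxxx-... form
  bool same = (session_id == session_id);                         // compares the raw 128-bit value
  size_t slot = std::hash<FASTER::core::Guid>{}(session_id);      // routed to Guid::GetHashCode()
  (void)text; (void)same; (void)slot;
}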
+class Guid { + public: +#ifdef _WIN32 + Guid() { + guid_.Data1 = 0; + guid_.Data2 = 0; + guid_.Data3 = 0; + std::memset(guid_.Data4, 0, 8); + } +#else + Guid() { + uuid_clear(uuid_); + } +#endif + + private: +#ifdef _WIN32 + Guid(const GUID& guid) + : guid_{ guid } { + } +#else + Guid(const uuid_t uuid) { + uuid_copy(uuid_, uuid); + } +#endif + + public: +#ifdef _WIN32 + static Guid Create() { + GUID guid; + HRESULT result = ::CoCreateGuid(&guid); + assert(result == S_OK); + return guid; + } +#else + static Guid Create() { + uuid_t uuid; + uuid_generate(uuid); + return uuid; + } +#endif + +#ifdef _WIN32 + std::string ToString() const { + char buffer[37]; + size_t offset = sprintf(buffer, "%.8lX-%.4hX-%.4hX-", guid_.Data1, guid_.Data2, guid_.Data3); + for(size_t idx = 0; idx < 2; ++idx) { + offset += sprintf(buffer + offset, "%.2hhX", guid_.Data4[idx]); + } + offset += sprintf(buffer + offset, "-"); + for(size_t idx = 2; idx < sizeof(guid_.Data4); ++idx) { + offset += sprintf(buffer + offset, "%.2hhX", guid_.Data4[idx]); + } + buffer[36] = '\0'; + return std::string{ buffer }; + } +#else + std::string ToString() const { + char buffer[37]; + uuid_unparse(uuid_, buffer); + return std::string{ buffer }; + } +#endif + +#ifdef _WIN32 + bool operator==(const Guid& other) const { + return guid_.Data1 == other.guid_.Data1 && + guid_.Data2 == other.guid_.Data2 && + guid_.Data3 == other.guid_.Data3 && + std::memcmp(guid_.Data4, other.guid_.Data4, 8) == 0; + } +#else + bool operator==(const Guid& other) const { + return uuid_compare(uuid_, other.uuid_) == 0; + } +#endif + +#ifdef _WIN32 + uint32_t GetHashCode() const { + // From C#, .NET Reference Framework. + return guid_.Data1 ^ ((static_cast(guid_.Data2) << 16) | + static_cast(guid_.Data3)) ^ + ((static_cast(guid_.Data4[2]) << 24) | guid_.Data4[7]); + } +#else + uint32_t GetHashCode() const { + uint32_t Data1; + uint16_t Data2; + uint16_t Data3; + std::memcpy(&Data1, uuid_, sizeof(Data1)); + std::memcpy(&Data2, uuid_ + 4, sizeof(Data2)); + std::memcpy(&Data3, uuid_ + 6, sizeof(Data3)); + // From C#, .NET Reference Framework. + return Data1 ^ ((static_cast(Data2) << 16) | + static_cast(Data3)) ^ + ((static_cast(uuid_[10]) << 24) | uuid_[15]); + } +#endif + + private: +#ifdef _WIN32 + GUID guid_; +#else + uuid_t uuid_; +#endif +}; + +} +} // namespace FASTER::core + +/// Implement std::hash<> for GUIDs. +namespace std { +template<> +struct hash { + size_t operator()(const FASTER::core::Guid& val) const { + return val.GetHashCode(); + } +}; +} diff --git a/cc/src/core/hash_bucket.h b/cc/src/core/hash_bucket.h new file mode 100644 index 000000000..081e8de14 --- /dev/null +++ b/cc/src/core/hash_bucket.h @@ -0,0 +1,201 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include + +#include "address.h" +#include "constants.h" +#include "malloc_fixed_page_size.h" + +namespace FASTER { +namespace core { + +static_assert(Address::kAddressBits == 48, "Address::kAddressBits != 48"); + +/// Entry stored in a hash bucket. Packed into 8 bytes. 
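An illustrative check of the 8-byte packing described above, with field widths taken from the bit-field declaration in the struct below (48-bit address, 14-bit tag, one reserved bit, one tentative bit); the include path is an assumption:

#include <cassert>
#include "core/hash_bucket.h"   // assumed include path

void hash_bucket_entry_sketch() {
  using namespace FASTER::core;
  HashBucketEntry entry{ Address{ 0x1234 }, /*tag=*/ 7, /*tentative=*/ false };
  assert(entry.address().control() == 0x1234);   // bits 0..47: logical hybrid-log address
  assert(entry.tag() == 7);                      // bits 48..61: key-hash discriminator
  assert(!entry.tentative());                    // bit 63: marks an in-progress (tentative) insert
  assert(!entry.unused());                       // unused() means the whole 64-bit word is zero
}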
+struct HashBucketEntry { + /// Invalid value in the hash table + static constexpr uint64_t kInvalidEntry = 0; + + HashBucketEntry() + : control_{ 0 } { + } + HashBucketEntry(Address address, uint16_t tag, bool tentative) + : address_{ address.control() } + , tag_{ tag } + , reserved_{ 0 } + , tentative_{ tentative } { + } + HashBucketEntry(uint64_t code) + : control_{ code } { + } + HashBucketEntry(const HashBucketEntry& other) + : control_{ other.control_ } { + } + + inline HashBucketEntry& operator=(const HashBucketEntry& other) { + control_ = other.control_; + return *this; + } + inline bool operator ==(const HashBucketEntry& other) const { + return control_ == other.control_; + } + inline bool operator !=(const HashBucketEntry& other) const { + return control_ != other.control_; + } + inline bool unused() const { + return control_ == 0; + } + inline Address address() const { + return Address{ address_ }; + } + inline uint16_t tag() const { + return static_cast(tag_); + } + inline bool tentative() const { + return static_cast(tentative_); + } + inline void set_tentative(bool desired) { + tentative_ = desired; + } + + union { + struct { + uint64_t address_ : 48; // corresponds to logical address + uint64_t tag_ : 14; + uint64_t reserved_ : 1; + uint64_t tentative_ : 1; + }; + uint64_t control_; + }; +}; +static_assert(sizeof(HashBucketEntry) == 8, "sizeof(HashBucketEntry) != 8"); + +/// Atomic hash-bucket entry. +class AtomicHashBucketEntry { + public: + AtomicHashBucketEntry(const HashBucketEntry& entry) + : control_{ entry.control_ } { + } + /// Default constructor + AtomicHashBucketEntry() + : control_{ HashBucketEntry::kInvalidEntry } { + } + + /// Atomic access. + inline HashBucketEntry load() const { + return HashBucketEntry{ control_.load() }; + } + inline void store(const HashBucketEntry& desired) { + control_.store(desired.control_); + } + inline bool compare_exchange_strong(HashBucketEntry& expected, HashBucketEntry desired) { + uint64_t expected_control = expected.control_; + bool result = control_.compare_exchange_strong(expected_control, desired.control_); + expected = HashBucketEntry{ expected_control }; + return result; + } + + private: + /// Atomic address to the hash bucket entry. + std::atomic control_; +}; + +/// Entry stored in a hash bucket that points to the next overflow bucket (if any). +struct HashBucketOverflowEntry { + HashBucketOverflowEntry() + : control_{ 0 } { + } + HashBucketOverflowEntry(FixedPageAddress address) + : address_{ address.control() } + , unused_{ 0 } { + } + HashBucketOverflowEntry(const HashBucketOverflowEntry& other) + : control_{ other.control_ } { + } + HashBucketOverflowEntry(uint64_t code) + : control_{ code } { + } + + inline HashBucketOverflowEntry& operator=(const HashBucketOverflowEntry& other) { + control_ = other.control_; + return *this; + } + inline bool operator ==(const HashBucketOverflowEntry& other) const { + return control_ == other.control_; + } + inline bool operator !=(const HashBucketOverflowEntry& other) const { + return control_ != other.control_; + } + inline bool unused() const { + return address_ == 0; + } + inline FixedPageAddress address() const { + return FixedPageAddress{ address_ }; + } + + union { + struct { + uint64_t address_ : 48; // corresponds to logical address + uint64_t unused_ : 16; + }; + uint64_t control_; + }; +}; +static_assert(sizeof(HashBucketOverflowEntry) == 8, "sizeof(HashBucketOverflowEntry) != 8"); + +/// Atomic hash-bucket overflow entry. 
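The AtomicHashBucketEntry wrapper above is updated with the usual lock-free idiom: load the current value, build the desired value, and publish it with compare_exchange_strong, treating a failed exchange as "another thread won" (this is how CleanHashTableBuckets() in faster.h drops stale entries). A small sketch of that idiom, with a hypothetical helper name:

// Returns true if this thread installed `desired`; on failure, another thread changed the entry
// first and `expected` has been refreshed with the value that thread installed.
inline bool try_replace_entry(FASTER::core::AtomicHashBucketEntry& atomic_entry,
                              FASTER::core::HashBucketEntry desired) {
  FASTER::core::HashBucketEntry expected = atomic_entry.load();
  return atomic_entry.compare_exchange_strong(expected, desired);
}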
+class AtomicHashBucketOverflowEntry { + private: + static constexpr uint64_t kPinIncrement = (uint64_t)1 << 48; + static constexpr uint64_t kLocked = (uint64_t)1 << 63; + + public: + AtomicHashBucketOverflowEntry(const HashBucketOverflowEntry& entry) + : control_{ entry.control_ } { + } + /// Default constructor + AtomicHashBucketOverflowEntry() + : control_{ HashBucketEntry::kInvalidEntry } { + } + + /// Atomic access. + inline HashBucketOverflowEntry load() const { + return HashBucketOverflowEntry{ control_.load() }; + } + inline void store(const HashBucketOverflowEntry& desired) { + control_.store(desired.control_); + } + inline bool compare_exchange_strong(HashBucketOverflowEntry& expected, + HashBucketOverflowEntry desired) { + uint64_t expected_control = expected.control_; + bool result = control_.compare_exchange_strong(expected_control, desired.control_); + expected = HashBucketOverflowEntry{ expected_control }; + return result; + } + + private: + /// Atomic address to the hash bucket entry. + std::atomic control_; +}; + +/// A bucket consisting of 7 hash bucket entries, plus one hash bucket overflow entry. Fits in +/// a cache line. +struct alignas(Constants::kCacheLineBytes) HashBucket { + /// Number of entries per bucket (excluding overflow entry). + static constexpr uint32_t kNumEntries = 7; + /// The entries. + AtomicHashBucketEntry entries[kNumEntries]; + /// Overflow entry points to next overflow bucket, if any. + AtomicHashBucketOverflowEntry overflow_entry; +}; +static_assert(sizeof(HashBucket) == Constants::kCacheLineBytes, + "sizeof(HashBucket) != Constants::kCacheLineBytes"); + +} +} // namespace FASTER::core diff --git a/cc/src/core/hash_table.h b/cc/src/core/hash_table.h new file mode 100644 index 000000000..4fc05a482 --- /dev/null +++ b/cc/src/core/hash_table.h @@ -0,0 +1,294 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include + +#include "hash_bucket.h" +#include "key_hash.h" + +namespace FASTER { +namespace core { + +/// The hash table itself: a sized array of HashBuckets. 
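A sketch of how a key hash is routed through the table declared below: idx() selects one of the size() buckets, the seven in-line entries are scanned for a matching 14-bit tag, and the overflow entry is followed when the bucket is full. The helper is illustrative (the real lookup lives in faster.h and also handles tentative entries); the template parameters avoid committing to the overflow allocator's exact signature, and the include paths are assumptions:

#include <cstdint>
#include "core/hash_bucket.h"   // assumed include paths
#include "core/hash_table.h"
#include "core/key_hash.h"

template <class hash_table_t, class overflow_allocator_t>
FASTER::core::HashBucketEntry find_entry_sketch(hash_table_t& table,
                                                overflow_allocator_t& overflow_allocator,
                                                FASTER::core::KeyHash hash) {
  using namespace FASTER::core;
  const HashBucket* bucket = &table.bucket(hash);          // bucket index = hash.idx(table.size())
  while(true) {
    for(uint32_t entry_idx = 0; entry_idx < HashBucket::kNumEntries; ++entry_idx) {
      HashBucketEntry entry = bucket->entries[entry_idx].load();
      if(!entry.unused() && entry.tag() == hash.tag()) {
        return entry;                                      // candidate chain for this key hash
      }
    }
    HashBucketOverflowEntry overflow_entry = bucket->overflow_entry.load();
    if(overflow_entry.unused()) {
      return HashBucketEntry{ HashBucketEntry::kInvalidEntry };   // no entry with this tag
    }
    bucket = &overflow_allocator.Get(overflow_entry.address());   // walk the overflow chain
  }
}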
+template +class InternalHashTable { + public: + typedef D disk_t; + typedef typename D::file_t file_t; + + InternalHashTable() + : size_{ 0 } + , buckets_{ nullptr } + , disk_{ nullptr } + , pending_checkpoint_writes_{ 0 } + , pending_recover_reads_{ 0 } + , checkpoint_pending_{ false } + , checkpoint_failed_{ false } + , recover_pending_{ false } + , recover_failed_{ false } { + } + + ~InternalHashTable() { + if(buckets_) { + aligned_free(buckets_); + } + } + + inline void Initialize(uint64_t new_size, uint64_t alignment) { + assert(new_size < INT32_MAX); + assert(Utility::IsPowerOfTwo(new_size)); + assert(Utility::IsPowerOfTwo(alignment)); + assert(alignment >= Constants::kCacheLineBytes); + if(size_ != new_size) { + size_ = new_size; + if(buckets_) { + aligned_free(buckets_); + } + buckets_ = reinterpret_cast(aligned_alloc(alignment, + size_ * sizeof(HashBucket))); + } + std::memset(buckets_, 0, size_ * sizeof(HashBucket)); + assert(pending_checkpoint_writes_ == 0); + assert(pending_recover_reads_ == 0); + assert(checkpoint_pending_ == false); + assert(checkpoint_failed_ == false); + assert(recover_pending_ == false); + assert(recover_failed_ == false); + } + + inline void Uninitialize() { + if(buckets_) { + aligned_free(buckets_); + buckets_ = nullptr; + } + size_ = 0; + assert(pending_checkpoint_writes_ == 0); + assert(pending_recover_reads_ == 0); + assert(checkpoint_pending_ == false); + assert(checkpoint_failed_ == false); + assert(recover_pending_ == false); + assert(recover_failed_ == false); + } + + /// Get the bucket specified by the hash. + inline const HashBucket& bucket(KeyHash hash) const { + return buckets_[hash.idx(size_)]; + } + inline HashBucket& bucket(KeyHash hash) { + return buckets_[hash.idx(size_)]; + } + + /// Get the bucket specified by the index. (Used by checkpoint/recovery.) + inline const HashBucket& bucket(uint64_t idx) const { + assert(idx < size_); + return buckets_[idx]; + } + /// (Used by GC and called by unit tests.) + inline HashBucket& bucket(uint64_t idx) { + assert(idx < size_); + return buckets_[idx]; + } + + inline uint64_t size() const { + return size_; + } + + // Checkpointing and recovery. + Status Checkpoint(disk_t& disk, file_t&& file, uint64_t& checkpoint_size); + inline Status CheckpointComplete(bool wait); + + Status Recover(disk_t& disk, file_t&& file, uint64_t checkpoint_size); + inline Status RecoverComplete(bool wait); + + void DumpDistribution(MallocFixedPageSize& overflow_buckets_allocator); + + private: + // Checkpointing and recovery. + class AsyncIoContext : public IAsyncContext { + public: + AsyncIoContext(InternalHashTable* table_) + : table{ table_ } { + } + /// The deep-copy constructor + AsyncIoContext(AsyncIoContext& other) + : table{ other.table } { + } + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + public: + InternalHashTable* table; + }; + + private: + uint64_t size_; + HashBucket* buckets_; + + /// State for ongoing checkpoint/recovery. + disk_t* disk_; + file_t file_; + std::atomic pending_checkpoint_writes_; + std::atomic pending_recover_reads_; + std::atomic checkpoint_pending_; + std::atomic checkpoint_failed_; + std::atomic recover_pending_; + std::atomic recover_failed_; +}; + +/// Implementations. 
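Before the implementations that follow, a worked example of the chunked I/O that Checkpoint() and Recover() perform: the bucket array is written as Constants::kNumMergeChunks equal slices, each as one async request. The concrete numbers below (table size, a chunk count of 32) are assumptions chosen only to make the arithmetic concrete:

#include <cstdint>

constexpr uint64_t kAssumedBuckets = uint64_t{ 1 } << 20;                   // 1 Mi buckets (illustrative)
constexpr uint64_t kAssumedMergeChunks = 32;                                // assumed Constants::kNumMergeChunks
constexpr uint64_t kBucketBytes = 64;                                       // sizeof(HashBucket): one cache line
constexpr uint64_t kChunkBuckets = kAssumedBuckets / kAssumedMergeChunks;   // 32,768 buckets per I/O
constexpr uint64_t kWriteBytes = kChunkBuckets * kBucketBytes;              // 2 MiB per WriteAsync
constexpr uint64_t kCheckpointBytes = kAssumedBuckets * kBucketBytes;       // 64 MiB for the whole table
static_assert(kWriteBytes == 2 * 1024 * 1024, "2 MiB per chunk under these assumptions");
static_assert(kCheckpointBytes == 64 * 1024 * 1024, "64 MiB total under these assumptions");

Each completed write decrements pending_checkpoint_writes_; the callback that brings it to zero closes the file and clears checkpoint_pending_, which is what CheckpointComplete() polls.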
+template +Status InternalHashTable::Checkpoint(disk_t& disk, file_t&& file, uint64_t& checkpoint_size) { + auto callback = [](IAsyncContext* ctxt, Status result, size_t bytes_transferred) { + CallbackContext context{ ctxt }; + if(result != Status::Ok) { + context->table->checkpoint_failed_ = true; + } + if(--context->table->pending_checkpoint_writes_ == 0) { + result = context->table->file_.Close(); + if(result != Status::Ok) { + context->table->checkpoint_failed_ = true; + } + context->table->checkpoint_pending_ = false; + } + }; + + assert(size_ % Constants::kNumMergeChunks == 0); + disk_ = &disk; + file_ = std::move(file); + + checkpoint_size = 0; + checkpoint_failed_ = false; + uint32_t chunk_size = static_cast(size_ / Constants::kNumMergeChunks); + uint32_t write_size = static_cast(chunk_size * sizeof(HashBucket)); + assert(write_size % file_.alignment() == 0); + assert(!checkpoint_pending_); + assert(pending_checkpoint_writes_ == 0); + checkpoint_pending_ = true; + pending_checkpoint_writes_ = Constants::kNumMergeChunks; + for(uint32_t idx = 0; idx < Constants::kNumMergeChunks; ++idx) { + AsyncIoContext context{ this }; + RETURN_NOT_OK(file_.WriteAsync(&bucket(idx * chunk_size), idx * write_size, write_size, + callback, context)); + } + checkpoint_size = size_ * sizeof(HashBucket); + return Status::Ok; +} + +template +inline Status InternalHashTable::CheckpointComplete(bool wait) { + disk_->TryComplete(); + bool complete = !checkpoint_pending_.load(); + while(wait && !complete) { + disk_->TryComplete(); + complete = !checkpoint_pending_.load(); + std::this_thread::yield(); + } + if(!complete) { + return Status::Pending; + } else { + return checkpoint_failed_ ? Status::IOError : Status::Ok; + } +} + +template +Status InternalHashTable::Recover(disk_t& disk, file_t&& file, uint64_t checkpoint_size) { + auto callback = [](IAsyncContext* ctxt, Status result, size_t bytes_transferred) { + CallbackContext context{ ctxt }; + if(result != Status::Ok) { + context->table->recover_failed_ = true; + } + if(--context->table->pending_recover_reads_ == 0) { + result = context->table->file_.Close(); + if(result != Status::Ok) { + context->table->recover_failed_ = true; + } + context->table->recover_pending_ = false; + } + }; + + assert(checkpoint_size > 0); + assert(checkpoint_size % sizeof(HashBucket) == 0); + assert(checkpoint_size % Constants::kNumMergeChunks == 0); + disk_ = &disk; + file_ = std::move(file); + + recover_failed_ = false; + uint32_t read_size = static_cast(checkpoint_size / Constants::kNumMergeChunks); + uint32_t chunk_size = static_cast(read_size / sizeof(HashBucket)); + assert(read_size % file_.alignment() == 0); + + Initialize(checkpoint_size / sizeof(HashBucket), file_.alignment()); + assert(!recover_pending_); + assert(pending_recover_reads_.load() == 0); + recover_pending_ = true; + pending_recover_reads_ = Constants::kNumMergeChunks; + for(uint32_t idx = 0; idx < Constants::kNumMergeChunks; ++idx) { + AsyncIoContext context{ this }; + RETURN_NOT_OK(file_.ReadAsync(idx * read_size, &bucket(idx * chunk_size), read_size, + callback, context)); + } + return Status::Ok; +} + +template +inline Status InternalHashTable::RecoverComplete(bool wait) { + disk_->TryComplete(); + bool complete = !recover_pending_.load(); + while(wait && !complete) { + disk_->TryComplete(); + complete = !recover_pending_.load(); + std::this_thread::yield(); + } + if(!complete) { + return Status::Pending; + } else { + return recover_failed_ ? 
Status::IOError : Status::Ok; + } +} + +template +inline void InternalHashTable::DumpDistribution( + MallocFixedPageSize& overflow_buckets_allocator) { + uint64_t table_size = size(); + uint64_t total_record_count = 0; + uint64_t histogram[16] = { 0 }; + for(uint64_t bucket_idx = 0; bucket_idx < table_size; ++bucket_idx) { + const HashBucket* bucket = &buckets_[bucket_idx]; + uint64_t count = 0; + while(bucket) { + for(uint32_t entry_idx = 0; entry_idx < HashBucket::kNumEntries; ++entry_idx) { + if(!bucket->entries[entry_idx].load().unused()) { + ++count; + ++total_record_count; + } + } + HashBucketOverflowEntry overflow_entry = bucket->overflow_entry.load(); + if(overflow_entry.unused()) { + bucket = nullptr; + } else { + bucket = &overflow_buckets_allocator.Get(overflow_entry.address()); + } + } + if(count < 15) { + ++histogram[count]; + } else { + ++histogram[15]; + } + } + + printf("number of hash buckets: %" PRIu64 "\n", table_size); + printf("total record count: %" PRIu64 "\n", total_record_count); + printf("histogram:\n"); + for(uint8_t idx = 0; idx < 15; ++idx) { + printf("%2u : %" PRIu64 "\n", idx, histogram[idx]); + } + printf("15+: %" PRIu64 "\n", histogram[15]); +} + +} +} // namespace FASTER::core diff --git a/cc/src/core/internal_contexts.h b/cc/src/core/internal_contexts.h new file mode 100644 index 000000000..8a0cfd787 --- /dev/null +++ b/cc/src/core/internal_contexts.h @@ -0,0 +1,379 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "address.h" +#include "guid.h" +#include "hash_bucket.h" +#include "native_buffer_pool.h" +#include "record.h" +#include "state_transitions.h" +#include "thread.h" + +namespace FASTER { +namespace core { + +/// Internal contexts, used by FASTER. + +enum class OperationType : uint8_t { + Read, + RMW, + Upsert, + Insert, + Delete +}; + +enum class OperationStatus : uint8_t { + SUCCESS, + NOT_FOUND, + RETRY_NOW, + RETRY_LATER, + RECORD_ON_DISK, + SUCCESS_UNMARK, + NOT_FOUND_UNMARK, + CPR_SHIFT_DETECTED +}; + +/// Internal FASTER context. +template +class PendingContext : public IAsyncContext { + public: + typedef K key_t; + + protected: + PendingContext(OperationType type_, IAsyncContext& caller_context_, + AsyncCallback caller_callback_) + : type{ type_ } + , caller_context{ &caller_context_ } + , caller_callback{ caller_callback_ } + , version{ UINT32_MAX } + , phase{ Phase::INVALID } + , result{ Status::Pending } + , address{ Address::kInvalidAddress } + , entry{ HashBucketEntry::kInvalidEntry } { + } + + public: + /// The deep-copy constructor. + PendingContext(const PendingContext& other, IAsyncContext* caller_context_) + : type{ other.type } + , caller_context{ caller_context_ } + , caller_callback{ other.caller_callback } + , version{ other.version } + , phase{ other.phase } + , result{ other.result } + , address{ other.address } + , entry{ other.entry } { + } + + public: + /// Go async, for the first time. + void go_async(Phase phase_, uint32_t version_, Address address_, HashBucketEntry entry_) { + phase = phase_; + version = version_; + address = address_; + entry = entry_; + } + + /// Go async, again. + void continue_async(Address address_, HashBucketEntry entry_) { + address = address_; + entry = entry_; + } + + virtual const key_t& key() const = 0; + + /// Caller context. + IAsyncContext* caller_context; + /// Caller callback. + AsyncCallback caller_callback; + /// Checkpoint version. 
+ uint32_t version; + /// Checkpoint phase. + Phase phase; + /// Type of operation (Read, Upsert, RMW, etc.). + OperationType type; + /// Result of operation. + Status result; + /// Address of the record being read or modified. + Address address; + /// Hash table entry that (indirectly) leads to the record being read or modified. + HashBucketEntry entry; +}; + +/// FASTER's internal Read() context. + +/// An internal Read() context that has gone async and lost its type information. +template +class AsyncPendingReadContext : public PendingContext { + public: + typedef K key_t; + protected: + AsyncPendingReadContext(IAsyncContext& caller_context_, AsyncCallback caller_callback_) + : PendingContext(OperationType::Read, caller_context_, caller_callback_) { + } + /// The deep copy constructor. + AsyncPendingReadContext(AsyncPendingReadContext& other, IAsyncContext* caller_context) + : PendingContext(other, caller_context) { + } + public: + virtual void Get(const void* rec) = 0; + virtual void GetAtomic(const void* rec) = 0; +}; + +/// A synchronous Read() context preserves its type information. +template +class PendingReadContext : public AsyncPendingReadContext { + public: + typedef RC read_context_t; + typedef typename read_context_t::key_t key_t; + typedef typename read_context_t::value_t value_t; + typedef Record record_t; + + PendingReadContext(read_context_t& caller_context_, AsyncCallback caller_callback_) + : AsyncPendingReadContext(caller_context_, caller_callback_) { + } + /// The deep copy constructor. + PendingReadContext(PendingReadContext& other, IAsyncContext* caller_context_) + : AsyncPendingReadContext(other, caller_context_) { + } + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, PendingContext::caller_context, + context_copy); + } + private: + inline const read_context_t& read_context() const { + return *static_cast(PendingContext::caller_context); + } + inline read_context_t& read_context() { + return *static_cast(PendingContext::caller_context); + } + public: + /// Accessors. + inline const key_t& key() const final { + return read_context().key(); + } + inline void Get(const void* rec) final { + const record_t* record = reinterpret_cast(rec); + read_context().Get(record->value()); + } + inline void GetAtomic(const void* rec) final { + const record_t* record = reinterpret_cast(rec); + read_context().GetAtomic(record->value()); + } +}; + +/// FASTER's internal Upsert() context. + +/// An internal Upsert() context that has gone async and lost its type information. +template +class AsyncPendingUpsertContext : public PendingContext { + public: + typedef K key_t; + protected: + AsyncPendingUpsertContext(IAsyncContext& caller_context_, AsyncCallback caller_callback_) + : PendingContext(OperationType::Upsert, caller_context_, caller_callback_) { + } + /// The deep copy constructor. + AsyncPendingUpsertContext(AsyncPendingUpsertContext& other, IAsyncContext* caller_context) + : PendingContext(other, caller_context) { + } + public: + virtual void Put(void* rec) = 0; + virtual bool PutAtomic(void* rec) = 0; + virtual uint32_t value_size() const = 0; +}; + +/// A synchronous Upsert() context preserves its type information. 
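The upsert_context_t that the wrapper below forwards to is caller-supplied. A hypothetical example of that surface, using made-up SketchKey/SketchValue types (none of these names exist in this commit; Utility::GetHashCode and the IAsyncContext deep-copy helper are assumed to be available from the surrounding headers):

#include <atomic>
#include <cstdint>
#include "core/async.h"      // assumed include paths
#include "core/key_hash.h"
#include "core/utility.h"

// Hypothetical 8-byte key satisfying the implicit key concept (hashable, equality-comparable).
struct SketchKey {
  uint64_t key;
  inline FASTER::core::KeyHash GetHash() const {
    return FASTER::core::KeyHash{ FASTER::core::Utility::GetHashCode(key) };
  }
  inline bool operator==(const SketchKey& other) const { return key == other.key; }
  inline bool operator!=(const SketchKey& other) const { return key != other.key; }
};

// Hypothetical value; the atomic member keeps in-place updates of mutable records safe.
struct SketchValue {
  std::atomic<uint64_t> counter{ 0 };
};

// Hypothetical caller-side upsert context: exactly the members the wrapper below forwards to.
class SketchUpsertContext : public FASTER::core::IAsyncContext {
 public:
  typedef SketchKey key_t;
  typedef SketchValue value_t;

  SketchUpsertContext(uint64_t key, uint64_t value)
    : key_{ key }
    , value_{ value } {
  }

  inline const SketchKey& key() const {
    return key_;
  }
  inline static constexpr uint32_t value_size() {
    return sizeof(value_t);
  }
  // Fills a freshly allocated record that no other thread can see yet.
  inline void Put(SketchValue& value) {
    value.counter.store(value_, std::memory_order_relaxed);
  }
  // Updates a record in the mutable region in place; returning false forces a copy instead.
  inline bool PutAtomic(SketchValue& value) {
    value.counter.store(value_);
    return true;
  }

 protected:
  // Copies this context to the heap if the operation goes async.
  FASTER::core::Status DeepCopy_Internal(FASTER::core::IAsyncContext*& context_copy) final {
    return IAsyncContext::DeepCopy_Internal(*this, context_copy);
  }

 private:
  SketchKey key_;
  uint64_t value_;
};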
+template +class PendingUpsertContext : public AsyncPendingUpsertContext { + public: + typedef UC upsert_context_t; + typedef typename upsert_context_t::key_t key_t; + typedef typename upsert_context_t::value_t value_t; + typedef Record record_t; + + PendingUpsertContext(upsert_context_t& caller_context_, AsyncCallback caller_callback_) + : AsyncPendingUpsertContext(caller_context_, caller_callback_) { + } + /// The deep copy constructor. + PendingUpsertContext(PendingUpsertContext& other, IAsyncContext* caller_context_) + : AsyncPendingUpsertContext(other, caller_context_) { + } + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, PendingContext::caller_context, + context_copy); + } + private: + inline const upsert_context_t& upsert_context() const { + return *static_cast(PendingContext::caller_context); + } + inline upsert_context_t& upsert_context() { + return *static_cast(PendingContext::caller_context); + } + public: + /// Accessors. + inline const key_t& key() const final { + return upsert_context().key(); + } + inline void Put(void* rec) final { + record_t* record = reinterpret_cast(rec); + upsert_context().Put(record->value()); + } + inline bool PutAtomic(void* rec) final { + record_t* record = reinterpret_cast(rec); + return upsert_context().PutAtomic(record->value()); + } + inline constexpr uint32_t value_size() const final { + return upsert_context().value_size(); + } +}; + +/// FASTER's internal Rmw() context. +/// An internal Rmw() context that has gone async and lost its type information. +template +class AsyncPendingRmwContext : public PendingContext { + public: + typedef K key_t; + protected: + AsyncPendingRmwContext(IAsyncContext& caller_context_, AsyncCallback caller_callback_) + : PendingContext(OperationType::RMW, caller_context_, caller_callback_) { + } + /// The deep copy constructor. + AsyncPendingRmwContext(AsyncPendingRmwContext& other, IAsyncContext* caller_context) + : PendingContext(other, caller_context) { + } + public: + /// Set initial value. + virtual void RmwInitial(void* rec) = 0; + /// RCU. + virtual void RmwCopy(const void* old_rec, void* rec) = 0; + /// in-place update. + virtual bool RmwAtomic(void* rec) = 0; + virtual uint32_t value_size() const = 0; +}; + +/// A synchronous Rmw() context preserves its type information. +template +class PendingRmwContext : public AsyncPendingRmwContext { + public: + typedef MC rmw_context_t; + typedef typename rmw_context_t::key_t key_t; + typedef typename rmw_context_t::value_t value_t; + typedef Record record_t; + + PendingRmwContext(rmw_context_t& caller_context_, AsyncCallback caller_callback_) + : AsyncPendingRmwContext(caller_context_, caller_callback_) { + } + /// The deep copy constructor. + PendingRmwContext(PendingRmwContext& other, IAsyncContext* caller_context_) + : AsyncPendingRmwContext(other, caller_context_) { + } + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, PendingContext::caller_context, + context_copy); + } + private: + const rmw_context_t& rmw_context() const { + return *static_cast(PendingContext::caller_context); + } + rmw_context_t& rmw_context() { + return *static_cast(PendingContext::caller_context); + } + public: + /// Accessors. + const key_t& key() const { + return rmw_context().key(); + } + /// Set initial value. 
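A companion sketch for the read-modify-write contract (RmwInitial, RmwCopy, RmwAtomic) that the PendingRmwContext wrapper forwards to, implementing a counter increment; it reuses the hypothetical SketchKey and SketchValue types from the upsert sketch above:

// Hypothetical caller-side RMW context: an atomic add over SketchValue::counter.
class SketchRmwContext : public FASTER::core::IAsyncContext {
 public:
  typedef SketchKey key_t;
  typedef SketchValue value_t;

  SketchRmwContext(uint64_t key, uint64_t delta)
    : key_{ key }
    , delta_{ delta } {
  }

  inline const SketchKey& key() const {
    return key_;
  }
  inline static constexpr uint32_t value_size() {
    return sizeof(value_t);
  }
  // First write for this key: seed the counter.
  inline void RmwInitial(SketchValue& value) {
    value.counter.store(delta_, std::memory_order_relaxed);
  }
  // Copy-on-write path: derive the new record from the immutable old one.
  inline void RmwCopy(const SketchValue& old_value, SketchValue& value) {
    value.counter.store(old_value.counter.load() + delta_, std::memory_order_relaxed);
  }
  // In-place path for records in the mutable region.
  inline bool RmwAtomic(SketchValue& value) {
    value.counter.fetch_add(delta_);
    return true;
  }

 protected:
  FASTER::core::Status DeepCopy_Internal(FASTER::core::IAsyncContext*& context_copy) final {
    return IAsyncContext::DeepCopy_Internal(*this, context_copy);
  }

 private:
  SketchKey key_;
  uint64_t delta_;
};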
+ inline void RmwInitial(void* rec) final { + record_t* record = reinterpret_cast(rec); + rmw_context().RmwInitial(record->value()); + } + /// RCU. + inline void RmwCopy(const void* old_rec, void* rec) final { + const record_t* old_record = reinterpret_cast(old_rec); + record_t* record = reinterpret_cast(rec); + rmw_context().RmwCopy(old_record->value(), record->value()); + } + /// in-place update. + inline bool RmwAtomic(void* rec) final { + record_t* record = reinterpret_cast(rec); + return rmw_context().RmwAtomic(record->value()); + } + inline constexpr uint32_t value_size() const final { + return rmw_context().value_size(); + } +}; + +class AsyncIOContext; + +/// Per-thread execution context. (Just the stuff that's checkpointed to disk.) +struct PersistentExecContext { + PersistentExecContext() + : serial_num{ 0 } + , version{ 0 } + , guid{} { + } + + void Initialize(uint32_t version_, const Guid& guid_, uint64_t serial_num_) { + serial_num = serial_num_; + version = version_; + guid = guid_; + } + + uint64_t serial_num; + uint32_t version; + /// Unique identifier for this session. + Guid guid; +}; +static_assert(sizeof(PersistentExecContext) == 32, "sizeof(PersistentExecContext) != 32"); + +/// Per-thread execution context. (Also includes state kept in-memory-only.) +struct ExecutionContext : public PersistentExecContext { + /// Default constructor. + ExecutionContext() + : phase{ Phase::INVALID } + , io_id{ 0 } { + } + + void Initialize(Phase phase_, uint32_t version_, const Guid& guid_, uint64_t serial_num_) { + assert(retry_requests.empty()); + assert(pending_ios.empty()); + assert(io_responses.empty()); + + PersistentExecContext::Initialize(version_, guid_, serial_num_); + phase = phase_; + retry_requests.clear(); + io_id = 0; + pending_ios.clear(); + io_responses.clear(); + } + + Phase phase; + + /// Retry request contexts are stored inside the deque. + std::deque retry_requests; + /// Assign a unique ID to every I/O request. + uint64_t io_id; + /// For each pending I/O, maps io_id to the hash of the key being retrieved. + std::unordered_map pending_ios; + + /// The I/O completion thread hands the PendingContext back to the thread that issued the + /// request. + concurrent_queue io_responses; +}; + +} +} // namespace FASTER::core diff --git a/cc/src/core/key_hash.h b/cc/src/core/key_hash.h new file mode 100644 index 000000000..5d3521b2a --- /dev/null +++ b/cc/src/core/key_hash.h @@ -0,0 +1,54 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include "utility.h" + +namespace FASTER { +namespace core { + +/// Hash of a key is 8 bytes, compatible with hash bucket entry. +struct KeyHash { + KeyHash() + : control_{ 0 } { + } + explicit KeyHash(uint64_t code) + : control_{ code } { + } + KeyHash(const KeyHash& other) + : control_{ other.control_ } { + } + + KeyHash& operator=(const KeyHash& other) { + control_ = other.control_; + } + + /// Truncate the key hash's address to get the page_index into a hash table of specified size. + inline uint64_t idx(uint64_t size) const { + assert(Utility::IsPowerOfTwo(size)); + return address_ & (size - 1); + } + + /// The tag (14 bits) serves as a discriminator inside a hash bucket. (Hash buckets use 2 bits + /// for control and 48 bits for log-structured store offset; the remaining 14 bits discriminate + /// between different key hashes stored in the same bucket.) 
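A worked example (with an arbitrary hash value) of how idx() and tag() carve up the 64 bits; this split is also what lets SplitHashTableBuckets() in faster.h test hash.idx(new_size) < old_size to route an entry when the table doubles:

// hash = 0x0123456789ABCDEF: the low 48 bits are the address part, the next 14 bits the tag.
static_assert((0x0123456789ABCDEFull & ((1ull << 20) - 1)) == 0xBCDEF,
              "idx() over a 2^20-bucket table keeps the low 20 address bits");
static_assert(((0x0123456789ABCDEFull >> 48) & 0x3FFF) == 0x0123,
              "tag() is the 14 bits above the 48-bit address");
// Doubling the table to 2^21 buckets exposes exactly one more address bit to idx(), so every
// entry lands either in its old slot or in its old slot + 2^20.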
+ inline uint16_t tag() const { + return static_cast(tag_); + } + + private: + union { + struct { + uint64_t address_ : 48; + uint64_t tag_ : 14; + uint64_t not_used_ : 2; + }; + uint64_t control_; + }; +}; +static_assert(sizeof(KeyHash) == 8, "sizeof(KeyHash) != 8"); + +} +} // namespace FASTER::core diff --git a/cc/src/core/light_epoch.h b/cc/src/core/light_epoch.h new file mode 100644 index 000000000..f08219191 --- /dev/null +++ b/cc/src/core/light_epoch.h @@ -0,0 +1,328 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "alloc.h" +#include "async.h" +#include "constants.h" +#include "phase.h" +#include "thread.h" +#include "utility.h" + +namespace FASTER { +namespace core { + +class LightEpoch { + private: + /// Entry in epoch table + struct alignas(Constants::kCacheLineBytes) Entry { + Entry() + : local_current_epoch{ 0 } + , reentrant{ 0 } + , phase_finished{ Phase::REST } { + } + + uint64_t local_current_epoch; + uint32_t reentrant; + std::atomic phase_finished; + }; + static_assert(sizeof(Entry) == 64, "sizeof(Entry) != 64"); + + struct EpochAction { + typedef void(*callback_t)(IAsyncContext*); + + static constexpr uint64_t kFree = UINT64_MAX; + static constexpr uint64_t kLocked = UINT64_MAX - 1; + + EpochAction() + : epoch{ kFree } + , callback{ nullptr } + , context{ nullptr } { + } + + void Initialize() { + callback = nullptr; + context = nullptr; + epoch = kFree; + } + + bool IsFree() const { + return epoch.load() == kFree; + } + + bool TryPop(uint64_t expected_epoch) { + bool retval = epoch.compare_exchange_strong(expected_epoch, kLocked); + if(retval) { + callback_t callback_ = callback; + IAsyncContext* context_ = context; + callback = nullptr; + context = nullptr; + // Release the lock. + epoch.store(kFree); + // Perform the action. + callback_(context_); + } + return retval; + } + + bool TryPush(uint64_t prior_epoch, callback_t new_callback, IAsyncContext* new_context) { + uint64_t expected_epoch = kFree; + bool retval = epoch.compare_exchange_strong(expected_epoch, kLocked); + if(retval) { + callback = new_callback; + context = new_context; + // Release the lock. + epoch.store(prior_epoch); + } + return retval; + } + + bool TrySwap(uint64_t expected_epoch, uint64_t prior_epoch, callback_t new_callback, + IAsyncContext* new_context) { + bool retval = epoch.compare_exchange_strong(expected_epoch, kLocked); + if(retval) { + callback_t existing_callback = callback; + IAsyncContext* existing_context = context; + callback = new_callback; + context = new_context; + // Release the lock. + epoch.store(prior_epoch); + // Perform the action. + existing_callback(existing_context); + } + return retval; + } + + /// The epoch field is atomic--always read it first and write it last. + std::atomic epoch; + + void(*callback)(IAsyncContext* context); + IAsyncContext* context; + }; + + public: + /// Default invalid page_index entry. + static constexpr uint32_t kInvalidIndex = 0; + /// This thread is not protecting any epoch. + static constexpr uint64_t kUnprotected = 0; + + private: + /// Default number of entries in the entries table + static constexpr uint32_t kTableSize = Thread::kMaxNumThreads; + /// Default drainlist size + static constexpr uint32_t kDrainListSize = 256; + /// Epoch table + Entry* table_; + /// Number of entries in epoch table. 
+ uint32_t num_entries_; + + /// List of action, epoch pairs containing actions to performed when an epoch becomes + /// safe to reclaim. + EpochAction drain_list_[kDrainListSize]; + /// Count of drain actions + std::atomic drain_count_; + + public: + /// Current system epoch (global state) + std::atomic current_epoch; + /// Cached value of epoch that is safe to reclaim + std::atomic safe_to_reclaim_epoch; + + LightEpoch(uint32_t size = kTableSize) + : table_{ nullptr } + , num_entries_{ 0 } + , drain_count_{ 0 } + , drain_list_{} { + Initialize(size); + } + + ~LightEpoch() { + Uninitialize(); + } + + private: + void Initialize(uint32_t size) { + num_entries_ = size; + // do cache-line alignment + table_ = reinterpret_cast(aligned_alloc(Constants::kCacheLineBytes, + (size + 2) * sizeof(Entry))); + new(table_) Entry[size + 2]; + current_epoch = 1; + safe_to_reclaim_epoch = 0; + for(uint32_t idx = 0; idx < kDrainListSize; ++idx) { + drain_list_[idx].Initialize(); + } + drain_count_ = 0; + } + + void Uninitialize() { + aligned_free(table_); + table_ = nullptr; + num_entries_ = 0; + current_epoch = 1; + safe_to_reclaim_epoch = 0; + } + + public: + /// Enter the thread into the protected code region + inline uint64_t Protect() { + uint32_t entry = Thread::id(); + table_[entry].local_current_epoch = current_epoch.load(); + return table_[entry].local_current_epoch; + } + + /// Enter the thread into the protected code region + /// Process entries in drain list if possible + inline uint64_t ProtectAndDrain() { + uint32_t entry = Thread::id(); + table_[entry].local_current_epoch = current_epoch.load(); + if(drain_count_.load() > 0) { + Drain(table_[entry].local_current_epoch); + } + return table_[entry].local_current_epoch; + } + + uint64_t ReentrantProtect() { + uint32_t entry = Thread::id(); + if(table_[entry].local_current_epoch != kUnprotected) + return table_[entry].local_current_epoch; + table_[entry].local_current_epoch = current_epoch.load(); + table_[entry].reentrant++; + return table_[entry].local_current_epoch; + } + + inline bool IsProtected() { + uint32_t entry = Thread::id(); + return table_[entry].local_current_epoch != kUnprotected; + } + + /// Exit the thread from the protected code region. 
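A sketch of the intended call pattern around the protected region; the method names come from this class, while the callback and context in the commented-out line are illustrative placeholders:

void epoch_usage_sketch(FASTER::core::LightEpoch& epoch) {
  epoch.ProtectAndDrain();   // enter: publishes current_epoch for this thread, drains ready actions
  // ... read or update epoch-protected structures (hash-table buckets, hybrid-log pages) ...
  epoch.Unprotect();         // leave: this thread no longer pins any epoch

  // Deferred reclamation: the callback runs only after no thread still holds an epoch older than
  // the bump, i.e., once the pre-bump epoch becomes safe to reclaim.
  // epoch.BumpCurrentEpoch(&MyFreeCallback, my_context);
}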
+ void Unprotect() { + table_[Thread::id()].local_current_epoch = kUnprotected; + } + + void ReentrantUnprotect() { + uint32_t entry = Thread::id(); + if(--(table_[entry].reentrant) == 0) { + table_[entry].local_current_epoch = kUnprotected; + } + } + + void Drain(uint64_t nextEpoch) { + ComputeNewSafeToReclaimEpoch(nextEpoch); + for(uint32_t idx = 0; idx < kDrainListSize; ++idx) { + uint64_t trigger_epoch = drain_list_[idx].epoch.load(); + if(trigger_epoch <= safe_to_reclaim_epoch) { + if(drain_list_[idx].TryPop(trigger_epoch)) { + if(--drain_count_ == 0) { + break; + } + } + } + } + } + + /// Increment the current epoch (global system state) + uint64_t BumpCurrentEpoch() { + uint64_t nextEpoch = ++current_epoch; + if(drain_count_ > 0) { + Drain(nextEpoch); + } + return nextEpoch; + } + + /// Increment the current epoch (global system state) and register + /// a trigger action for when older epoch becomes safe to reclaim + uint64_t BumpCurrentEpoch(EpochAction::callback_t callback, IAsyncContext* context) { + uint64_t prior_epoch = BumpCurrentEpoch() - 1; + uint32_t i = 0, j = 0; + while(true) { + uint64_t trigger_epoch = drain_list_[i].epoch.load(); + if(trigger_epoch == EpochAction::kFree) { + if(drain_list_[i].TryPush(prior_epoch, callback, context)) { + ++drain_count_; + break; + } + } else if(trigger_epoch <= safe_to_reclaim_epoch.load()) { + if(drain_list_[i].TrySwap(trigger_epoch, prior_epoch, callback, context)) { + break; + } + } + if(++i == kDrainListSize) { + i = 0; + if(++j == 500) { + j = 0; + std::this_thread::sleep_for(std::chrono::seconds(1)); + fprintf(stderr, "Slowdown: Unable to add trigger to epoch\n"); + } + } + } + return prior_epoch + 1; + } + + /// Compute latest epoch that is safe to reclaim, by scanning the epoch table + uint64_t ComputeNewSafeToReclaimEpoch(uint64_t current_epoch_) { + uint64_t oldest_ongoing_call = current_epoch_; + for(uint32_t index = 1; index <= num_entries_; ++index) { + uint64_t entry_epoch = table_[index].local_current_epoch; + if(entry_epoch != kUnprotected && entry_epoch < oldest_ongoing_call) { + oldest_ongoing_call = entry_epoch; + } + } + safe_to_reclaim_epoch = oldest_ongoing_call - 1; + return safe_to_reclaim_epoch; + } + + void SpinWaitForSafeToReclaim(uint64_t current_epoch_, uint64_t safe_to_reclaim_epoch_) { + do { + ComputeNewSafeToReclaimEpoch(current_epoch_); + } while(safe_to_reclaim_epoch_ > safe_to_reclaim_epoch); + } + + bool IsSafeToReclaim(uint64_t epoch) const { + return (epoch <= safe_to_reclaim_epoch); + } + + /// CPR checkpoint functions. + inline void ResetPhaseFinished() { + for(size_t idx = 0; idx < Thread::kMaxNumThreads; ++idx) { + assert(table_[idx].phase_finished.load() == Phase::REST || + table_[idx].phase_finished.load() == Phase::PERSISTENCE_CALLBACK || + table_[idx].phase_finished.load() == Phase::GC_IN_PROGRESS || + table_[idx].phase_finished.load() == Phase::GROW_IN_PROGRESS); + table_[idx].phase_finished.store(Phase::REST); + } + } + /// This thread has completed the specified phase. + inline bool FinishThreadPhase(Phase phase) { + uint32_t entry = Thread::id(); + table_[entry].phase_finished = phase; + // Check if other threads have reported complete. 
+ for(size_t idx = 0; idx < Thread::kMaxNumThreads; ++idx) { + Phase entry_phase = table_[idx].phase_finished.load(); + uint64_t entry_epoch = table_[idx].local_current_epoch; + if(entry_epoch != 0 && entry_phase != phase) { + return false; + } + } + return true; + } + /// Has this thread completed the specified phase (i.e., is it waiting for other threads to + /// finish the specified phase, before it can advance the global phase)? + inline bool HasThreadFinishedPhase(Phase phase) const { + uint32_t entry = Thread::id(); + return table_[entry].phase_finished == phase; + } +}; + +} +} // namespace FASTER::core diff --git a/cc/src/core/lss_allocator.cc b/cc/src/core/lss_allocator.cc new file mode 100644 index 000000000..deb85e723 --- /dev/null +++ b/cc/src/core/lss_allocator.cc @@ -0,0 +1,169 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include +#include + +#include "alloc.h" +#include "auto_ptr.h" +#include "lss_allocator.h" +#include "thread.h" + +namespace FASTER { +namespace core { + +#define thread_index_ Thread::id() + +LssAllocator lss_allocator{}; + +namespace lss_memory { + +static_assert(sizeof(Header) < kBaseAlignment, "Unexpected header size!"); + +void SegmentAllocator::Free(void* bytes) { +#ifdef _DEBUG + Header* header = reinterpret_cast(bytes) - 1; + assert(header->offset < kSegmentSize); + assert(header->offset + header->size <= kSegmentSize); + // - 0xDA - freed. + ::memset(header + 1, 0xDA, header->size); +#endif + Free(); +} + +void SegmentAllocator::Seal(uint32_t allocations) { + SegmentState delta_state{ allocations, 1 }; + SegmentState old_state{ state.control.fetch_add(delta_state.control) }; + assert(old_state.allocations == 0); + assert(old_state.frees < allocations); + if(allocations == old_state.frees + 1) { + // We were the last to free a block inside this segment, so we must free it. + this->~SegmentAllocator(); + aligned_free(this); + } +} + +void SegmentAllocator::Free() { + SegmentState delta_state{ 0, 1 }; + SegmentState old_state{ state.control.fetch_add(delta_state.control) }; + assert(old_state.allocations == 0 || old_state.frees < old_state.allocations); + if(old_state.allocations == old_state.frees + 1) { + // We were the last to free a block inside this segment, so we must free it. + this->~SegmentAllocator(); + aligned_free(this); + } +} + +void* ThreadAllocator::Allocate(uint32_t size) { + if(!segment_allocator_) { + segment_allocator_ = reinterpret_cast(aligned_alloc(kCacheLineSize, + sizeof(SegmentAllocator))); + if(!segment_allocator_) { + return nullptr; + } + new(segment_allocator_) SegmentAllocator{}; + } + // Block is 16-byte aligned, after a 2-byte (8-byte in _DEBUG mode) header. + uint32_t block_size = static_cast(pad_alignment(size + sizeof(Header), + kBaseAlignment)); + uint32_t offset = Reserve(block_size); + if(segment_offset_ <= kSegmentSize) { + // The allocation succeeded inside the active segment. + uint8_t* buffer = segment_allocator_->buffer; +#ifdef _DEBUG + // - 0xCA - allocated. + ::memset(&buffer[offset], 0xCA, block_size); +#endif + Header* header = reinterpret_cast(&buffer[offset]); +#ifdef _DEBUG + new(header) Header(size, offset); +#else + new(header) Header(offset); +#endif + return header + 1; + } else { + // We filled the active segment; seal it. + segment_allocator_->Seal(allocations_); + segment_allocator_ = nullptr; + allocations_ = 0; + segment_offset_ = 0; + // Call self recursively, to allocate inside a new segment. 
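Seal() and Free() rely on packing the free count and the sealed allocation count into one 64-bit word, so a single fetch_add both publishes a change and reveals whether the caller released the last live block; the count passed to Seal() includes the Reserve() that overflowed the segment, which is why Seal() also contributes one "free". A standalone model of that accounting (not FASTER code; the field layout mirrors SegmentState and reuses its union-punning style):

#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdio>

union State {
  struct {
    uint32_t frees;
    uint32_t allocations;   // non-zero only once the segment has been sealed
  };
  uint64_t control;
};

std::atomic<uint64_t> segment_state{ 0 };

// Returns true when the caller is the one who must reclaim the segment.
bool Free() {
  State delta{}; delta.frees = 1;
  State old{}; old.control = segment_state.fetch_add(delta.control);
  return old.allocations != 0 && old.allocations == old.frees + 1;
}

bool Seal(uint32_t allocations) {   // count includes the Reserve() that overflowed
  State delta{}; delta.frees = 1; delta.allocations = allocations;
  State old{}; old.control = segment_state.fetch_add(delta.control);
  return allocations == old.frees + 1;
}

int main() {
  // Two live blocks were handed out; a third Reserve() overflowed, so Seal(3) is called.
  assert(!Free());     // first live block freed; segment not sealed yet
  assert(!Seal(3));    // sealed, but one live block remains
  assert(Free());      // last live block freed; this caller reclaims the segment
  std::puts("ok");
  return 0;
}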
+ return Allocate(size); + } +} + +void* ThreadAllocator::AllocateAligned(uint32_t size, uint32_t alignment) { + if(!segment_allocator_) { + segment_allocator_ = reinterpret_cast(aligned_alloc(kCacheLineSize, + sizeof(SegmentAllocator))); + if(!segment_allocator_) { + return nullptr; + } + new(segment_allocator_) SegmentAllocator{}; + } + // Alignment must be >= base alignment, and a power of 2. + assert(alignment >= kBaseAlignment); + assert((alignment & (alignment - 1)) == 0); + // Block needs to be large enough to hold the user block, the header, and the align land fill. + // Max align land fill size is (alignment - kBaseAlignment). + uint32_t block_size = static_cast(pad_alignment( + size + sizeof(Header) + alignment - kBaseAlignment, + kBaseAlignment)); + uint32_t block_offset = Reserve(block_size); + if(segment_offset_ <= kSegmentSize) { + // The allocation succeeded inside the active segment. + uint8_t* buffer = segment_allocator_->buffer; +#ifdef _DEBUG + // - 0xEA - align land fill. + ::memset(&buffer[block_offset], 0xEA, block_size); +#endif + // Align the user block. + uint32_t user_offset = static_cast(pad_alignment(reinterpret_cast( + &buffer[block_offset]) + sizeof(Header), alignment) - + reinterpret_cast(&buffer[block_offset]) - sizeof(Header)); + assert(user_offset + sizeof(Header) + size <= block_size); + uint32_t offset = block_offset + user_offset; +#ifdef _DEBUG + // - 0xCA - allocated. + ::memset(&buffer[offset], 0xCA, size + sizeof(Header)); +#endif + Header* header = reinterpret_cast(&buffer[offset]); +#ifdef _DEBUG + new(header) Header(size, offset); +#else + new(header) Header(offset); +#endif + return header + 1; + } else { + // We filled the active segment; seal it. + segment_allocator_->Seal(allocations_); + segment_allocator_ = nullptr; + allocations_ = 0; + segment_offset_ = 0; + // Call self recursively, to allocate inside a new segment. + return AllocateAligned(size, alignment); + } +} +} // namespace lss_memory + +void* LssAllocator::Allocate(uint32_t size) { + return thread_allocators_[thread_index_].Allocate(size); +} + +void* LssAllocator::AllocateAligned(uint32_t size, uint32_t alignment) { + return thread_allocators_[thread_index_].AllocateAligned(size, alignment); +} + +void LssAllocator::Free(void* bytes) { + lss_memory::Header* header = reinterpret_cast(bytes) - 1; + uint8_t* block = reinterpret_cast(header); + uint32_t offset = header->offset + lss_memory::SegmentAllocator::kBufferOffset; + lss_memory::SegmentAllocator* segment_allocator = + reinterpret_cast(block - offset); + segment_allocator->Free(bytes); +} + +#undef thread_index_ + +} +} // namespace FASTER::core diff --git a/cc/src/core/lss_allocator.h b/cc/src/core/lss_allocator.h new file mode 100644 index 000000000..d1fe14504 --- /dev/null +++ b/cc/src/core/lss_allocator.h @@ -0,0 +1,237 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#ifdef _DEBUG +#include +#endif + +#include "status.h" +#include "thread.h" + +/// A fast allocator intended for mostly-FIFO workloads (e.g., allocating contexts for file-I/O +/// callbacks). Each thread allocates by bumping the tail of its current segment; when it fills a +/// segment, it malloc()s a new one. Any thread frees by decrementing the allocation's segment's +/// ref count; when a (filled) segment's ref count reaches 0, we free() it. So long as the workload +/// is mostly FIFO, we don't leak memory. 
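A usage sketch of the allocator just described, for the mostly-FIFO pattern it targets: allocate an I/O callback context, free it when the callback fires. The include path and the IoContext type are illustrative assumptions:

#include <cstdint>
#include <new>

#include "core/lss_allocator.h"   // include path assumed for illustration

struct IoContext {
  uint64_t offset;
  uint32_t length;
};

void IssueRead(uint64_t offset, uint32_t length) {
  // Bump-allocate the context from the calling thread's active segment.
  void* mem = FASTER::core::lss_allocator.Allocate(sizeof(IoContext));
  if(!mem) return;
  IoContext* context = new(mem) IoContext{ offset, length };
  // ... pass 'context' to an asynchronous read; the completion callback, possibly on
  // another thread, destroys and frees it, keeping lifetimes roughly FIFO ...
  context->~IoContext();
  FASTER::core::lss_allocator.Free(context);
}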
+ +namespace FASTER { +namespace core { + +/// Internal classes and structures. +namespace lss_memory { + +/// Size of each segment (in bytes). (In experiments, a segment size of 16,000 worked well for +/// on Windows, while 8,000 worked well on Linux.) +#ifdef _WIN32 +static constexpr uint32_t kSegmentSize = 16000; +#else +static constexpr uint32_t kSegmentSize = 8000; +#endif + +/// Preserving Windows malloc() behavior, all LSS allocations are aligned to 16 bytes. +static constexpr uint32_t kBaseAlignment = 16; + +/// Header, prepended to all allocated blocks; used to find the ref count variable, to decrement it +/// when the block is freed. (The allocation size isn't needed, since LSS allocations are +/// essentially stack allocations; but _DEBUG mode includes it for the benefit of the caller.) +#ifdef _DEBUG +struct alignas(8) Header { + Header(uint32_t size_, uint32_t offset_) + : offset{ offset_ } + , size{ size_ } { + } + + /// Offset from the head of the segment allocator's buffer to the memory block. + uint32_t offset; + + /// Size of the memory block. + uint32_t size; +}; +static_assert(sizeof(Header) == 8, "Header is not 8 bytes!"); +#else +struct alignas(8) Header { + Header(uint16_t offset_) + : offset{ offset_ } { + } + + /// Offset from the head of the segment allocator's buffer to the memory block. + uint16_t offset; +}; +static_assert(sizeof(Header) == 8, "Header is not 8 bytes!"); +#endif + +class ThreadAllocator; + +class SegmentState { + public: + SegmentState() + : control{ 0 } { + } + + SegmentState(uint64_t control_) + : control{ control_ } { + } + + SegmentState(uint32_t allocations_, uint32_t frees_) + : frees{ frees_ } + , allocations{ allocations_ } { + } + + union { + struct { + /// Count of memory blocks freed inside this segment. Incremented on each free. Frees can + /// take place on any thread. + uint32_t frees; + /// If this segment is sealed, then the count of memory blocks allocated inside this + /// segment. Otherwise, zero. + uint32_t allocations; + }; + /// 64-bit control field, used so that threads can read the allocation count atomically at + /// the same time they increment the free count atomically. + std::atomic control; + }; +}; +static_assert(kSegmentSize < UINT16_MAX / 2, "kSegmentSize too large for offset size!"); + +/// Allocation takes place inside segments. When a segment is no longer needed, we add it to the +/// garbage list. +class SegmentAllocator { + public: + /// Offset from the head of the class to the head of its buffer_ field. +#ifdef _DEBUG + static constexpr uint32_t kBufferOffset = 8; +#else + static constexpr uint32_t kBufferOffset = 14; +#endif + + /// Initialize the segment allocator and allocate the segment. + SegmentAllocator() + : state{} { +#ifdef _DEBUG + // Debug LSS memory codes: + // - 0xBA - initialized, not allocated. + std::memset(buffer, 0xBA, kSegmentSize); +#endif + } + + /// Free the specified memory block. The block must be inside this segment! Returns true if the + /// segment was freed; otherwise, returns false. + void Free(void* bytes); + + /// Seal the segment--no more blocks will be allocated inside this segment. Returns true if the + /// segment was freed; otherwise, returns false. + void Seal(uint32_t blocks_allocated); + + private: + /// Decrement the active references count, effectively freeing one allocation. Also frees the + /// segment if (1) it is sealed and (2) its active references count is now zero. Returns true if + /// the segment was freed; otherwise, returns false. 
+ void Free(); + + public: + /// Segment allocator state (8 bytes). + SegmentState state; + + /// This segment's memory. (First allocation's 8-byte Header starts at 8 (mod 16), so the + /// allocation's contents will start at 0 (mod 16), as desired.) + uint8_t buffer[kSegmentSize]; +}; + +/// Allocator for a single thread. Allocates only; frees are directed by the global allocator +/// object directly to the relevant segment allocator. +class alignas(64) ThreadAllocator { + public: + static constexpr uint32_t kCacheLineSize = 64; + + /// Initialize the thread allocator. The real work happens lazily, when Allocate() is called for + /// the first time. + ThreadAllocator() + : segment_allocator_{ nullptr } + , segment_offset_{ 0 } + , allocations_{ 0 } { + } + + /// Allocate a memory block of the specified size < kSegmentSize. If allocation fails, returns + /// nullptr. + void* Allocate(uint32_t size); + void* AllocateAligned(uint32_t size, uint32_t offset); + + private: + inline uint32_t Reserve(uint32_t block_size) { + assert(block_size <= kSegmentSize); + ++allocations_; + uint32_t result = segment_offset_; + assert(result <= kSegmentSize); + segment_offset_ += block_size; + return result; + } + + /// Segment inside which each thread's new allocations occur (pointer, 8 bytes). + SegmentAllocator* segment_allocator_; + + /// Offset, into the active segment, of the next allocation. + uint32_t segment_offset_; + + /// Number of blocks allocated inside the active segment. + uint32_t allocations_; +}; +static_assert(sizeof(ThreadAllocator) == 64, "sizeof(ThreadAllocator) != 64."); + +} // namespace lss_memory + +/// The LSS allocator allocates memory from a log-structured store, but does not perform garbage +/// collection. Memory is allocated from segments; each segment is freed only after all of its +/// allocations have been freed. This means that if a single allocation inside a segment is still +/// alive, the entire segment is still alive. +/// The LSS allocator works well in the case where memory usage is almost FIFO. In that case, all +/// of the segment's allocations will eventually be freed, so the segment will be freed. The LSS +/// allocator is intended to replace the (synchronous) function call stack, for asynchronous +/// continuations. +class LssAllocator { + public: + /// Maximum number of threads supported. For each possible thread, we reserve an 8-byte + /// ThreadAllocator; so the memory required is 8 * (kMaxThreadCount) bytes. For each actual + /// thread, we reserve a full SegmentAllocator, of size approximately kSegmentSize. + static constexpr size_t kMaxThreadCount = Thread::kMaxNumThreads; + + /// Size of each segment (in bytes). + static constexpr uint32_t kSegmentSize = lss_memory::kSegmentSize; + + /// Preserving Windows malloc() behavior, all LSS allocations are aligned to 16 bytes. + static constexpr uint32_t kBaseAlignment = lss_memory::kBaseAlignment; + + /// Initialize the LSS allocator. The real work happens lazily, when a thread calls Allocate() + /// for the first time. + LssAllocator() { + for(size_t idx = 0; idx < kMaxThreadCount; ++idx) { + thread_allocators_[idx] = lss_memory::ThreadAllocator{}; + } + } + + /// Allocate a memory block of the specified size. Note that size must be < kSegmentSize, since + /// the allocation will take place inside a segment. The Allocate() code is ultimately single- + /// threaded, since we maintain a separate ThreadAllocator per thread, each with its own + /// SegmentAllocator. If allocation fails, returns nullptr. 
+ void* Allocate(uint32_t size); + void* AllocateAligned(uint32_t size, uint32_t alignment); + + /// Free the specified memory block. The Free() code is thread-safe, since the Free() request is + /// always directed to the SegmentAllocator() that originally allocated the code--regardless of + /// what thread it is issued from. + void Free(void* bytes); + + private: + /// To reduce contention (and avoid needing atomic primitives in the allocation path), we + /// maintain a unique allocator per thread. + lss_memory::ThreadAllocator thread_allocators_[kMaxThreadCount]; +}; + +/// The global LSS allocator instance. +extern LssAllocator lss_allocator; + +} +} // namespace FASTER::core diff --git a/cc/src/core/malloc_fixed_page_size.h b/cc/src/core/malloc_fixed_page_size.h new file mode 100644 index 000000000..730e6df3f --- /dev/null +++ b/cc/src/core/malloc_fixed_page_size.h @@ -0,0 +1,582 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "alloc.h" +#include "light_epoch.h" + +namespace FASTER { +namespace core { + +/// The allocator used for the hash table's overflow buckets. + +/// Address into a fixed page. +struct FixedPageAddress { + static constexpr uint64_t kInvalidAddress = 0; + + /// A fixed-page address is 8 bytes. + /// --of which 48 bits are used for the address. (The remaining 16 bits are used by the hash + /// table, for control bits and the tag.) + static constexpr uint64_t kAddressBits = 48; + static constexpr uint64_t kMaxAddress = ((uint64_t)1 << kAddressBits) - 1; + + /// --of which 20 bits are used for offsets into a page, of size 2^20 = 1 million items. + static constexpr uint64_t kOffsetBits = 20; + static constexpr uint64_t kMaxOffset = ((uint64_t)1 << kOffsetBits) - 1; + + /// --and the remaining 28 bits are used for the page index, allowing for approximately 256 + /// million pages. + static constexpr uint64_t kPageBits = kAddressBits - kOffsetBits; + static constexpr uint64_t kMaxPage = ((uint64_t)1 << kPageBits) - 1; + + FixedPageAddress() + : control_{ 0 } { + } + FixedPageAddress(uint64_t control) + : control_{ control } { + } + + bool operator==(const FixedPageAddress& other) const { + assert(reserved == 0); + assert(other.reserved == 0); + return control_ == other.control_; + } + bool operator<(const FixedPageAddress& other) const { + assert(reserved == 0); + assert(other.reserved == 0); + return control_ < other.control_; + } + bool operator>(const FixedPageAddress& other) const { + assert(reserved == 0); + assert(other.reserved == 0); + return control_ > other.control_; + } + bool operator>=(const FixedPageAddress& other) const { + assert(reserved == 0); + assert(other.reserved == 0); + return control_ >= other.control_; + } + FixedPageAddress operator++() { + return FixedPageAddress{ ++control_ }; + } + + uint32_t offset() const { + return static_cast(offset_); + } + uint64_t page() const { + return page_; + } + uint64_t control() const { + return control_; + } + + union { + struct { + uint64_t offset_ : kOffsetBits; // 20 bits + uint64_t page_ : kPageBits; // 28 bits + uint64_t reserved : 64 - kAddressBits; // 16 bits + }; + uint64_t control_; + }; +}; +static_assert(sizeof(FixedPageAddress) == 8, "sizeof(FixedPageAddress) != 8"); + +/// Atomic address into a fixed page. 
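FixedPageAddress packs a 20-bit page offset, a 28-bit page index, and 16 reserved bits into one 64-bit word, so the control value doubles as a global slot index and a bare increment walks offsets first, then pages. A standalone check of that layout (not FASTER code):

#include <cassert>
#include <cstdint>
#include <cstdio>

constexpr uint64_t kOffsetBits = 20;
constexpr uint64_t kMaxOffset = (uint64_t{ 1 } << kOffsetBits) - 1;

int main() {
  uint64_t page = 3, offset = 42;
  uint64_t control = (page << kOffsetBits) | offset;   // reserved bits stay zero

  assert((control & kMaxOffset) == offset);            // low 20 bits: offset within the page
  assert((control >> kOffsetBits) == page);            // next 28 bits: page index
  // Because a page holds exactly 2^20 slots, control is also the global slot number,
  // which is why incrementing the control word implements bump allocation.
  assert(control == page * (kMaxOffset + 1) + offset);
  std::puts("ok");
  return 0;
}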
+class AtomicFixedPageAddress { + public: + AtomicFixedPageAddress(const FixedPageAddress& address) + : control_{ address.control_ } { + } + + /// Atomic access. + inline FixedPageAddress load() const { + return FixedPageAddress{ control_.load() }; + } + void store(FixedPageAddress value) { + control_.store(value.control_); + } + FixedPageAddress operator++(int) { + return FixedPageAddress{ control_++ }; + } + + + private: + /// Atomic access to the address. + std::atomic control_; +}; +static_assert(sizeof(AtomicFixedPageAddress) == 8, "sizeof(AtomicFixedPageAddress) != 8"); + +struct FreeAddress { + FixedPageAddress removed_addr; + uint64_t removal_epoch; +}; + +template +class FixedPage { + public: + typedef T item_t; + static constexpr uint64_t kPageSize = FixedPageAddress::kMaxOffset + 1; + + /// Accessors. + inline const item_t& element(uint32_t offset) const { + assert(offset <= FixedPageAddress::kMaxOffset); + return elements_[offset]; + } + inline item_t& element(uint32_t offset) { + assert(offset <= FixedPageAddress::kMaxOffset); + return elements_[offset]; + } + + private: + /// The page's contents. + item_t elements_[kPageSize]; + static_assert(alignof(item_t) <= Constants::kCacheLineBytes, + "alignof(item_t) > Constants::kCacheLineBytes"); +}; + +template +class FixedPageArray { + public: + typedef T item_t; + typedef FixedPage page_t; + typedef FixedPageArray array_t; + + protected: + FixedPageArray(uint64_t alignment_, uint64_t size_, const array_t* old_array) + : alignment{ alignment_ } + , size{ size_ } { + assert(Utility::IsPowerOfTwo(size)); + uint64_t idx = 0; + if(old_array) { + assert(old_array->size < size); + for(; idx < old_array->size; ++idx) { + page_t* page; + page = old_array->pages()[idx].load(std::memory_order_acquire); + while(page == nullptr) { + std::this_thread::yield(); + page = old_array->pages()[idx].load(std::memory_order_acquire); + } + pages()[idx] = page; + } + } + for(; idx < size; ++idx) { + pages()[idx] = nullptr; + } + } + + public: + static FixedPageArray* Create(uint64_t alignment, uint64_t size, const array_t* old_array) { + void* buffer = std::malloc(sizeof(array_t) + size * sizeof(std::atomic)); + return new(buffer) array_t{ alignment, size, old_array }; + } + + static void Delete(array_t* arr, bool owns_pages) { + assert(arr); + if(owns_pages) { + for(uint64_t idx = 0; idx < arr->size; ++idx) { + page_t* page = arr->pages()[idx].load(std::memory_order_acquire); + if(page) { + page->~FixedPage(); + aligned_free(page); + } + } + } + arr->~FixedPageArray(); + std::free(arr); + } + + /// Used by allocator.Get(). + inline page_t* Get(uint64_t page_idx) { + assert(page_idx < size); + return pages()[page_idx].load(std::memory_order_acquire); + } + + /// Used by allocator.Allocate(). + inline page_t* GetOrAdd(uint64_t page_idx) { + assert(page_idx < size); + page_t* page = pages()[page_idx].load(std::memory_order_acquire); + while(page == nullptr) { + page = AddPage(page_idx); + } + return page; + } + + inline page_t* AddPage(uint64_t page_idx) { + assert(page_idx < size); + void* buffer = aligned_alloc(alignment, sizeof(page_t)); + page_t* new_page = new(buffer) page_t{}; + page_t* expected = nullptr; + if(pages()[page_idx].compare_exchange_strong(expected, new_page, std::memory_order_release)) { + return new_page; + } else { + new_page->~page_t(); + aligned_free(new_page); + return expected; + } + } + + private: + /// Accessors, since zero-length arrays at the ends of structs aren't standard in C++. 
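FixedPageArray::AddPage() uses the usual lazy-publication idiom: any thread may race to create page i, exactly one compare-and-swap wins, and losers destroy their copy and adopt the published page. A standalone sketch of that idiom (not FASTER code; plain new/delete stands in for the aligned page allocation):

#include <atomic>
#include <cassert>

struct Page { int data[4] = {}; };

std::atomic<Page*> slot{ nullptr };

Page* GetOrAdd() {
  Page* page = slot.load(std::memory_order_acquire);
  while(page == nullptr) {
    Page* candidate = new Page{};
    Page* expected = nullptr;
    if(slot.compare_exchange_strong(expected, candidate, std::memory_order_release)) {
      page = candidate;          // we won: our page is now the published one
    } else {
      delete candidate;          // we lost: discard ours and use the published page
      page = expected;
    }
  }
  return page;
}

int main() {
  Page* a = GetOrAdd();
  Page* b = GetOrAdd();
  assert(a == b);                // every caller sees the same page
  delete a;
  return 0;
}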
+ const std::atomic* pages() const { + return reinterpret_cast*>(this + 1); + } + std::atomic* pages() { + return reinterpret_cast*>(this + 1); + } + + public: + /// Alignment at which each page is allocated. + const uint64_t alignment; + /// Maximum number of pages in the array; fixed at time of construction. + const uint64_t size; + /// Followed by [size] std::atomic<> pointers to (page_t) pages. (Not shown here.) +}; + +class alignas(Constants::kCacheLineBytes) FreeList { + public: + std::deque free_list; +}; + +template +class MallocFixedPageSize { + public: + typedef T item_t; + typedef D disk_t; + typedef typename D::file_t file_t; + typedef FixedPage page_t; + typedef FixedPageArray array_t; + typedef MallocFixedPageSize alloc_t; + + MallocFixedPageSize() + : alignment_{ UINT64_MAX } + , count_{ 0 } + , epoch_{ nullptr } + , page_array_{ nullptr } + , disk_{ nullptr } + , pending_checkpoint_writes_{ 0 } + , pending_recover_reads_{ 0 } + , checkpoint_pending_{ false } + , checkpoint_failed_{ false } + , recover_pending_{ false } + , recover_failed_{ false } { + } + + ~MallocFixedPageSize() { + if(page_array_.load() != nullptr) { + array_t::Delete(page_array_.load(), true); + } + } + + inline void Initialize(uint64_t alignment, LightEpoch& epoch) { + if(page_array_.load() != nullptr) { + array_t::Delete(page_array_.load(), true); + } + alignment_ = alignment; + count_.store(0); + epoch_ = &epoch; + disk_ = nullptr; + pending_checkpoint_writes_ = 0; + pending_recover_reads_ = 0; + checkpoint_pending_ = false; + checkpoint_failed_ = false; + recover_pending_ = false; + recover_failed_ = false; + + array_t* page_array = array_t::Create(alignment, 2, nullptr); + page_array->AddPage(0); + page_array_.store(page_array, std::memory_order_release); + // Allocate the null pointer. + Allocate(); + } + + inline void Uninitialize() { + if(page_array_.load() != nullptr) { + array_t::Delete(page_array_.load(), true); + page_array_.store(nullptr); + } + } + + inline item_t& Get(FixedPageAddress address) { + page_t* page = page_array_.load(std::memory_order_acquire)->Get(address.page()); + assert(page); + return page->element(address.offset()); + } + inline const item_t& Get(FixedPageAddress address) const { + page_t* page = page_array_.load(std::memory_order_acquire)->Get(address.page()); + assert(page); + return page->element(address.offset()); + } + + FixedPageAddress Allocate(); + + void FreeAtEpoch(FixedPageAddress addr, uint64_t removed_epoch) { + free_list().push_back(FreeAddress{ addr, removed_epoch }); + } + + /// Checkpointing and recovery. + Status Checkpoint(disk_t& disk, file_t&& file, uint64_t& size); + Status CheckpointComplete(bool wait); + + Status Recover(disk_t& disk, file_t&& file, uint64_t file_size, FixedPageAddress count); + Status RecoverComplete(bool wait); + + std::deque& free_list() { + return free_list_[Thread::id()].free_list; + } + const std::deque& free_list() const { + return free_list_[Thread::id()].free_list; + } + + FixedPageAddress count() const { + return count_.load(); + } + + private: + /// Checkpointing and recovery. 
+ class AsyncIoContext : public IAsyncContext { + public: + AsyncIoContext(alloc_t* allocator_) + : allocator{ allocator_ } { + } + + /// The deep-copy constructor + AsyncIoContext(AsyncIoContext& other) + : allocator{ other.allocator } { + } + + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + public: + alloc_t* allocator; + }; + + array_t* ExpandArray(array_t* expected, uint64_t new_size); + + private: + /// Alignment at which each page is allocated. + uint64_t alignment_; + /// Array of all of the pages we've allocated. + std::atomic page_array_; + /// How many elements we've allocated. + AtomicFixedPageAddress count_; + + LightEpoch* epoch_; + + /// State for ongoing checkpoint/recovery. + disk_t* disk_; + file_t file_; + std::atomic pending_checkpoint_writes_; + std::atomic pending_recover_reads_; + std::atomic checkpoint_pending_; + std::atomic checkpoint_failed_; + std::atomic recover_pending_; + std::atomic recover_failed_; + + FreeList free_list_[Thread::kMaxNumThreads]; +}; + +/// Implementations. +template +Status MallocFixedPageSize::Checkpoint(disk_t& disk, file_t&& file, uint64_t& size) { + constexpr uint32_t kWriteSize = page_t::kPageSize * sizeof(item_t); + + auto callback = [](IAsyncContext* ctxt, Status result, size_t bytes_transferred) { + CallbackContext context{ ctxt }; + if(result != Status::Ok) { + context->allocator->checkpoint_failed_ = true; + } + if(--context->allocator->pending_checkpoint_writes_ == 0) { + result = context->allocator->file_.Close(); + if(result != Status::Ok) { + context->allocator->checkpoint_failed_ = true; + } + context->allocator->checkpoint_pending_ = false; + } + }; + + disk_ = &disk; + file_ = std::move(file); + size = 0; + checkpoint_failed_ = false; + array_t* page_array = page_array_.load(); + FixedPageAddress count = count_.load(); + + uint64_t num_levels = count.page() + (count.offset() > 0 ? 1 : 0); + assert(!checkpoint_pending_); + assert(pending_checkpoint_writes_ == 0); + checkpoint_pending_ = true; + pending_checkpoint_writes_ = num_levels; + for(uint64_t idx = 0; idx < num_levels; ++idx) { + AsyncIoContext context{ this }; + RETURN_NOT_OK(file_.WriteAsync(page_array->Get(idx), idx * kWriteSize, kWriteSize, callback, + context)); + } + size = count.control_ * sizeof(item_t); + return Status::Ok; +} + +template +Status MallocFixedPageSize::CheckpointComplete(bool wait) { + disk_->TryComplete(); + bool complete = !checkpoint_pending_.load(); + while(wait && !complete) { + disk_->TryComplete(); + complete = !checkpoint_pending_.load(); + std::this_thread::yield(); + } + if(!complete) { + return Status::Pending; + } else { + return checkpoint_failed_ ? 
Status::IOError : Status::Ok; + } +} + +template +Status MallocFixedPageSize::Recover(disk_t& disk, file_t&& file, uint64_t file_size, + FixedPageAddress count) { + constexpr uint64_t kReadSize = page_t::kPageSize * sizeof(item_t); + + auto callback = [](IAsyncContext* ctxt, Status result, size_t bytes_transferred) { + CallbackContext context{ ctxt }; + if(result != Status::Ok) { + context->allocator->recover_failed_ = true; + } + if(--context->allocator->pending_recover_reads_ == 0) { + result = context->allocator->file_.Close(); + if(result != Status::Ok) { + context->allocator->recover_failed_ = true; + } + context->allocator->recover_pending_ = false; + } + }; + + assert(file_size % sizeof(item_t) == 0); + disk_ = &disk; + file_ = std::move(file); + recover_failed_ = false; + + // The size reserved by recovery is >= the size checkpointed to disk. + FixedPageAddress file_end_addr{ file_size / sizeof(item_t) }; + uint64_t num_file_levels = file_end_addr.page() + (file_end_addr.offset() > 0 ? 1 : 0); + assert(num_file_levels > 0); + assert(count >= file_end_addr); + uint64_t num_levels = count.page() + (count.offset() > 0 ? 1 : 0); + assert(num_levels > 0); + + array_t* page_array = page_array_.load(); + // Ensure that the allocator has enough pages. + if(page_array->size < num_levels) { + uint64_t new_size = next_power_of_two(num_levels); + page_array = ExpandArray(page_array, new_size); + } + count_.store(count); + assert(!recover_pending_); + assert(pending_recover_reads_.load() == 0); + recover_pending_ = true; + pending_recover_reads_ = num_file_levels; + for(uint64_t idx = 0; idx < num_file_levels; ++idx) { + //read a full page + AsyncIoContext context{ this }; + RETURN_NOT_OK(file_.ReadAsync(idx * kReadSize, page_array->GetOrAdd(idx), kReadSize, callback, + context)); + } + return Status::Ok; +} + +template +Status MallocFixedPageSize::RecoverComplete(bool wait) { + disk_->TryComplete(); + bool complete = !recover_pending_.load(); + while(wait && !complete) { + disk_->TryComplete(); + complete = !recover_pending_.load(); + std::this_thread::yield(); + } + if(!complete) { + return Status::Pending; + } else { + return recover_failed_ ? Status::IOError : Status::Ok; + } +} + +template +FixedPageArray* MallocFixedPageSize::ExpandArray(array_t* expected, uint64_t new_size) { + class Delete_Context : public IAsyncContext { + public: + Delete_Context(array_t* arr_) + : arr{ arr_ } { + } + /// The deep-copy constructor. + Delete_Context(const Delete_Context& other) + : arr{ other.arr } { + } + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + public: + array_t* arr; + }; + + auto delete_callback = [](IAsyncContext* ctxt) { + CallbackContext context{ ctxt }; + array_t::Delete(context->arr, false); + }; + + assert(Utility::IsPowerOfTwo(new_size)); + do { + array_t* new_array = array_t::Create(alignment_, new_size, expected); + if(page_array_.compare_exchange_strong(expected, new_array, std::memory_order_release)) { + // Have to free the old array, under epoch protection. 
+ Delete_Context context{ expected }; + IAsyncContext* context_copy; + Status result = context.DeepCopy(context_copy); + assert(result == Status::Ok); + epoch_->BumpCurrentEpoch(delete_callback, context_copy); + return new_array; + } else { + new_array->~array_t(); + std::free(new_array); + } + } while(expected->size < new_size); + return expected; +} + +template +inline FixedPageAddress MallocFixedPageSize::Allocate() { + if(!free_list().empty()) { + // Check the head of the free list. + if(free_list().front().removal_epoch <= epoch_->safe_to_reclaim_epoch.load()) { + FixedPageAddress removed_addr = free_list().front().removed_addr; + free_list().pop_front(); + return removed_addr; + } + } + // Determine insertion page_index. + FixedPageAddress addr = count_++; + array_t* page_array = page_array_.load(std::memory_order_acquire); + if(addr.page() >= page_array->size) { + // Need to resize the page array. + page_array = ExpandArray(page_array, next_power_of_two(addr.page() + 1)); + } + if(addr.offset() == 0 && addr.page() + 1 < page_array->size) { + // Add the next page early, to try to avoid blocking other threads. + page_array->AddPage(addr.page() + 1); + } + page_array->GetOrAdd(addr.page()); + return addr; +} + +} +} // namespace FASTER::core diff --git a/cc/src/core/native_buffer_pool.h b/cc/src/core/native_buffer_pool.h new file mode 100644 index 000000000..d1a2c387d --- /dev/null +++ b/cc/src/core/native_buffer_pool.h @@ -0,0 +1,188 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include + +#include "alloc.h" +#include "utility.h" + +#ifdef _WIN32 +#include +#pragma intrinsic(_BitScanReverse) + +/// Microsoft's concurrency::concurrent_queue is based on Intel's tbb::concurrent_queue. +#include +template +using concurrent_queue = concurrency::concurrent_queue; +#else +namespace FASTER { +/// Convert GCC's __builtin_clz() to Microsoft's _BitScanReverse. +inline uint8_t _BitScanReverse(unsigned long* index, uint32_t mask) { + bool found = mask > 0; + *index = 31 - __builtin_clz(mask); + return found; +} +} + +#include +template +using concurrent_queue = tbb::concurrent_queue; +#endif + +namespace FASTER { +namespace core { + +/// A buffer pool used for file I/Os. + +class NativeSectorAlignedBufferPool; + +/// A sector-aligned memory block, along with offsets into the block. +class SectorAlignedMemory { + public: + /// Default constructor. + SectorAlignedMemory() + : buffer_{ nullptr } + , valid_offset{ 0 } + , required_bytes{ 0 } + , available_bytes{ 0 } + , level_{ 0 } + , pool_{ nullptr } { + } + SectorAlignedMemory(uint8_t* buffer, uint32_t level, NativeSectorAlignedBufferPool* pool) + : buffer_{ buffer } + , valid_offset{ 0 } + , required_bytes{ 0 } + , available_bytes{ 0 } + , level_{ level } + , pool_{ pool } { + } + /// No copy constructor. + SectorAlignedMemory(const SectorAlignedMemory&) = delete; + /// Move constructor. + SectorAlignedMemory(SectorAlignedMemory&& other) + : buffer_{ other.buffer_ } + , valid_offset{ other.valid_offset } + , required_bytes{ other.required_bytes } + , available_bytes{ other.available_bytes } + , level_{ other.level_ } + , pool_{ other.pool_ } { + other.buffer_ = nullptr; + other.pool_ = nullptr; + } + + inline ~SectorAlignedMemory(); + + /// Move assignment operator. 
+ inline SectorAlignedMemory& operator=(SectorAlignedMemory&& other); + + inline void CopyValidBytesToAddress(uint8_t* pt) const { + std::memcpy(pt, &buffer_[valid_offset], required_bytes); + } + inline uint8_t* GetValidPointer() { + return &buffer_[valid_offset]; + } + inline uint8_t* buffer() { + return buffer_; + } + + private: + uint8_t* buffer_; + public: + uint32_t valid_offset; + uint32_t required_bytes; + uint32_t available_bytes; + private: + uint32_t level_; + NativeSectorAlignedBufferPool* pool_; +}; +static_assert(sizeof(SectorAlignedMemory) == 32, "sizeof(SectorAlignedMemory) != 32"); + +/// Aligned buffer pool is a pool of memory. +/// Internally, it is organized as an array of concurrent queues where each concurrent +/// queue represents a memory of size in particular range. queue_[i] contains memory +/// segments each of size (2^i * sectorSize). +class NativeSectorAlignedBufferPool { + private: + static constexpr uint32_t kLevels = 32; + + public: + NativeSectorAlignedBufferPool(uint32_t recordSize, uint32_t sectorSize) + : record_size_{ recordSize } + , sector_size_{ sectorSize } { + } + + inline void Return(uint32_t level, uint8_t* buffer) { + assert(level < kLevels); + queue_[level].push(buffer); + } + inline SectorAlignedMemory Get(uint32_t numRecords); + + private: + uint32_t Level(uint32_t sectors) { + assert(sectors > 0); + if(sectors == 1) { + return 0; + } + // BSR returns the page_index k of the most-significant 1 bit. So 2^(k+1) > (sectors - 1) >= + // 2^k, which means 2^(k+1) >= sectors > 2^k. + unsigned long k; + _BitScanReverse(&k, sectors - 1); + return k + 1; + } + + uint32_t record_size_; + uint32_t sector_size_; + /// Level 0 caches memory allocations of size (sectorSize); level n+1 caches allocations of size + /// (sectorSize) * 2^n. + concurrent_queue queue_[kLevels]; +}; + +/// Implementations. +inline SectorAlignedMemory& SectorAlignedMemory::operator=(SectorAlignedMemory&& other) { + if(buffer_ == other.buffer_) { + // Self-assignment is a no-op. + return *this; + } + if(buffer_ != nullptr) { + // Return our buffer to the pool, before taking ownership of a new buffer. + pool_->Return(level_, buffer_); + } + buffer_ = other.buffer_; + valid_offset = other.valid_offset; + required_bytes = other.required_bytes; + available_bytes = other.available_bytes; + level_ = other.level_; + pool_ = other.pool_; + + // We own the buffer now; other SectorAlignedMemory does not. + other.buffer_ = nullptr; + other.pool_ = nullptr; + return *this; +} + +inline SectorAlignedMemory::~SectorAlignedMemory() { + if(buffer_) { + pool_->Return(level_, buffer_); + } +} + +inline SectorAlignedMemory NativeSectorAlignedBufferPool::Get(uint32_t numRecords) { + // How many sectors do we need? + uint32_t sectors_required = (numRecords * record_size_ + sector_size_ - 1) / sector_size_; + uint32_t level = Level(sectors_required); + uint8_t* buffer; + if(queue_[level].try_pop(buffer)) { + return SectorAlignedMemory{ buffer, level, this }; + } else { + uint8_t* buffer = reinterpret_cast(aligned_alloc(sector_size_, + sector_size_ * (1 << level))); + return SectorAlignedMemory{ buffer, level, this }; + } +} + +} +} // namespace FASTER::core \ No newline at end of file diff --git a/cc/src/core/persistent_memory_malloc.h b/cc/src/core/persistent_memory_malloc.h new file mode 100644 index 000000000..abaa0b9d0 --- /dev/null +++ b/cc/src/core/persistent_memory_malloc.h @@ -0,0 +1,1021 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
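NativeSectorAlignedBufferPool::Get() maps a request of n sectors to queue_[k], where k is the smallest level with 2^k >= n, so each queue recycles buffers of size sector_size * 2^k. A standalone model of that level computation (not FASTER code; a portable loop replaces _BitScanReverse):

#include <cassert>
#include <cstdint>

uint32_t Level(uint32_t sectors) {
  assert(sectors > 0);
  uint32_t level = 0;
  while((uint32_t{ 1 } << level) < sectors) {
    ++level;                 // round up to the next power-of-two bucket
  }
  return level;
}

int main() {
  assert(Level(1) == 0);
  assert(Level(2) == 1);
  assert(Level(3) == 2);     // rounded up to 4 sectors
  assert(Level(8) == 3);
  assert(Level(9) == 4);     // rounded up to 16 sectors
  return 0;
}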
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "device/file_system_disk.h" +#include "address.h" +#include "async_result_types.h" +#include "gc_state.h" +#include "light_epoch.h" +#include "native_buffer_pool.h" +#include "recovery_status.h" +#include "status.h" + +namespace FASTER { +namespace core { + +/// The log allocator, used by FASTER to store records. + +enum class FlushStatus : uint8_t { + Flushed, + InProgress +}; + +enum class CloseStatus : uint8_t { + Closed, + Open +}; + +/// Pack flush- and close-status into a single 16-bit value. +/// State transitions are: +/// { Flushed, Closed } (default state) +/// --> { InProgress, Open } (when issuing the flush to disk) +/// --> either { . , Closed} (when moving the head address forward) +/// or { Flushed, . } (when the flush completes). +struct FlushCloseStatus { + FlushCloseStatus() + : flush{ FlushStatus::Flushed } + , close{ CloseStatus::Closed } { + } + + FlushCloseStatus(FlushStatus flush_, CloseStatus close_) + : flush{ flush_ } + , close{ close_ } { + } + + FlushCloseStatus(uint16_t control_) + : control{ control_ } { + } + + /// Is the page ready for use? + inline bool Ready() const { + return flush == FlushStatus::Flushed && close == CloseStatus::Open; + } + + union { + struct { + FlushStatus flush; + CloseStatus close; + }; + uint16_t control; + }; +}; +static_assert(sizeof(FlushCloseStatus) == 2, "sizeof(FlushCloseStatus) != 2"); + +/// Atomic version of FlushCloseStatus. Can set and get flush- and close- status, together, +/// atomically. +class AtomicFlushCloseStatus { + public: + AtomicFlushCloseStatus() + : status_{} { + } + + inline void store(FlushStatus flush, CloseStatus close) { + // Sets flush and close statuses, atomically. + FlushCloseStatus status{ flush, close }; + control_.store(status.control); + } + + inline FlushCloseStatus load() const { + // Gets flush and close statuses, atomically. + return FlushCloseStatus{ control_.load() }; + } + + inline bool compare_exchange_weak(FlushCloseStatus& expected, FlushCloseStatus value) { + uint16_t expected_control = expected.control; + bool result = control_.compare_exchange_weak(expected_control, value.control); + expected.control = expected_control; + return result; + } + inline bool compare_exchange_strong(FlushCloseStatus& expected, FlushCloseStatus value) { + uint16_t expected_control = expected.control; + bool result = control_.compare_exchange_strong(expected_control, value.control); + expected.control = expected_control; + return result; + } + + union { + FlushCloseStatus status_; + std::atomic control_; + }; +}; +static_assert(sizeof(AtomicFlushCloseStatus) == 2, "sizeof(FlushCloseStatus) != 2"); + +struct FullPageStatus { + FullPageStatus() + : LastFlushedUntilAddress{ 0 } + , status{} { + } + + AtomicAddress LastFlushedUntilAddress; + AtomicFlushCloseStatus status; +}; +static_assert(sizeof(FullPageStatus) == 16, "sizeof(FullPageStatus) != 16"); + +/// Page and offset of the tail of the log. Can reserve space within the current page or move to a +/// new page. +class PageOffset { + public: + PageOffset(uint32_t page, uint64_t offset) + : offset_{ offset } + , page_{ page } { + assert(page <= Address::kMaxPage); + } + + PageOffset(uint64_t control) + : control_{ control } { + } + + PageOffset(const Address& address) + : offset_{ address.offset() } + , page_{ address.page() } { + } + + /// Accessors. 
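The flush and close states below share a 16-bit word precisely so that one field can be updated without losing a concurrent change to the other. A standalone model of the compare-exchange loop used for that (not FASTER code; it reuses the same union-punning style as the header):

#include <atomic>
#include <cassert>
#include <cstdint>

enum class FlushStatus : uint8_t { Flushed, InProgress };
enum class CloseStatus : uint8_t { Closed, Open };

union PackedStatus {
  struct {
    FlushStatus flush;
    CloseStatus close;
  };
  uint16_t control;
};

std::atomic<uint16_t> status{ 0 };   // { Flushed, Closed }, the default state

// Mark the page flushed while preserving whatever close status is current.
void MarkFlushed() {
  PackedStatus old_status;
  old_status.control = status.load();
  PackedStatus new_status;
  do {
    new_status.flush = FlushStatus::Flushed;
    new_status.close = old_status.close;
  } while(!status.compare_exchange_weak(old_status.control, new_status.control));
}

int main() {
  // A racing thread closed the page while its flush was still in progress.
  PackedStatus in_flight;
  in_flight.flush = FlushStatus::InProgress;
  in_flight.close = CloseStatus::Closed;
  status.store(in_flight.control);

  MarkFlushed();

  PackedStatus now;
  now.control = status.load();
  assert(now.flush == FlushStatus::Flushed && now.close == CloseStatus::Closed);
  return 0;
}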
+ inline uint64_t offset() const { + return offset_; + } + inline uint32_t page() const { + return static_cast(page_); + } + inline uint64_t control() const { + return control_; + } + + /// Conversion operator. + inline operator Address() const { + assert(offset_ <= Address::kMaxOffset); + return Address{ page(), static_cast(offset()) }; + } + + private: + /// Use 41 bits for offset, which gives us approximately 2 PB of overflow space, for + /// Reserve(). + union { + struct { + uint64_t offset_ : 64 - Address::kPageBits; + uint64_t page_ : Address::kPageBits; + }; + uint64_t control_; + }; +}; +static_assert(sizeof(PageOffset) == 8, "sizeof(PageOffset) != 8"); + +/// Atomic page + offset marker. Can Reserve() space from current page, or move to NewPage(). +class AtomicPageOffset { + public: + AtomicPageOffset() + : control_{ 0 } { + } + + AtomicPageOffset(uint32_t page, uint64_t offset) + : control_{ PageOffset{ page, offset } .control() } { + } + + AtomicPageOffset(const Address& address) { + PageOffset page_offset{ address }; + control_.store(page_offset.control()); + } + + /// Reserve space within the current page. Can overflow the page boundary (so result offset > + /// Address::kMaxOffset). + inline PageOffset Reserve(uint32_t num_slots) { + assert(num_slots <= Address::kMaxOffset); + PageOffset offset{ 0, num_slots }; + return PageOffset{ control_.fetch_add(offset.control()) }; + } + + /// Move to the next page. The compare-and-swap can fail. Returns "true" if some thread advanced + /// the thread; sets "won_cas" = "true" if this thread won the CAS, which means it has been + /// chosen to set up the new page. + inline bool NewPage(uint32_t old_page, bool& won_cas) { + assert(old_page < Address::kMaxPage); + won_cas = false; + PageOffset expected_page_offset = load(); + if(old_page != expected_page_offset.page()) { + // Another thread already moved to the new page. + assert(old_page < expected_page_offset.page()); + return true; + } + PageOffset new_page{ old_page + 1, 0 }; + uint64_t expected = expected_page_offset.control(); + // Try to move to a new page. + won_cas = control_.compare_exchange_strong(expected, new_page.control()); + return PageOffset{ expected } .page() > old_page; + } + + inline PageOffset load() const { + return PageOffset{ control_.load() }; + } + inline void store(Address address) { + PageOffset page_offset{ address.page(), address.offset() }; + control_.store(page_offset.control()); + } + + private: + union { + /// Atomic access to the page+offset. + std::atomic control_; + }; +}; +static_assert(sizeof(AtomicPageOffset) == 8, "sizeof(AtomicPageOffset) != 8"); + +/// The main allocator. +template +class PersistentMemoryMalloc { + public: + typedef D disk_t; + typedef typename D::file_t file_t; + typedef typename D::log_file_t log_file_t; + typedef PersistentMemoryMalloc alloc_t; + + /// Each page in the buffer is 2^25 bytes (= 32 MB). + static constexpr uint64_t kPageSize = Address::kMaxOffset + 1; + + /// The first 4 HLOG pages should be below the head (i.e., being flushed to disk). 
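Tail allocation works by a single fetch_add on the packed page+offset word: because the offset field is 41 bits wide, reservations may run past the 2^25-byte page, and a thread whose reservation spills over discards it and requests a new page instead. A standalone model of that overflow detection (not FASTER code):

#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdio>

constexpr uint64_t kOffsetBits = 41;                          // offset field width in PageOffset
constexpr uint64_t kOffsetMask = (uint64_t{ 1 } << kOffsetBits) - 1;
constexpr uint64_t kPageSize = uint64_t{ 1 } << 25;           // 32 MB log pages

std::atomic<uint64_t> tail{ 0 };                              // page in the high bits, offset low

// Reserve num_slots bytes at the tail; the caller must check for page overflow.
void Reserve(uint32_t num_slots, uint64_t& page, uint64_t& offset) {
  uint64_t old_control = tail.fetch_add(num_slots);           // bumps only the offset bits
  page = old_control >> kOffsetBits;
  offset = old_control & kOffsetMask;
}

int main() {
  uint64_t page, offset;
  Reserve(512, page, offset);
  assert(page == 0 && offset == 0 && offset + 512 <= kPageSize);   // fits: use the address

  tail.store(kPageSize - 8);                                  // page 0 is nearly full
  Reserve(512, page, offset);
  assert(offset + 512 > kPageSize);                           // spilled: caller must move to a new page
  std::puts("ok");
  return 0;
}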
+ static constexpr uint32_t kNumHeadPages = 4; + + PersistentMemoryMalloc(uint64_t log_size, LightEpoch& epoch, disk_t& disk_, log_file_t& file_, + Address start_address, double log_mutable_fraction) + : sector_size{ static_cast(file_.alignment()) } + , epoch_{ &epoch } + , disk{ &disk_ } + , file{ &file_ } + , read_buffer_pool{ 1, sector_size } + , io_buffer_pool{ 1, sector_size } + , read_only_address{ start_address } + , safe_read_only_address{ start_address } + , head_address{ start_address } + , safe_head_address{ start_address } + , flushed_until_address{ start_address } + , begin_address{ start_address } + , tail_page_offset_{ start_address } + , buffer_size_{ 0 } + , pages_{ nullptr } + , page_status_{ nullptr } { + assert(start_address.page() <= Address::kMaxPage); + + if(log_size % kPageSize != 0) { + throw std::invalid_argument{ "Log size must be a multiple of 32 MB" }; + } + if(log_size % kPageSize > UINT32_MAX) { + throw std::invalid_argument{ "Log size must be <= 128 PB" }; + } + buffer_size_ = static_cast(log_size / kPageSize); + + if(buffer_size_ <= kNumHeadPages + 1) { + throw std::invalid_argument{ "Must have at least 2 non-head pages" }; + } + // The latest N pages should be mutable. + num_mutable_pages_ = static_cast(log_mutable_fraction * buffer_size_); + if(num_mutable_pages_ <= 1) { + // Need at least two mutable pages: one to write to, and one to open up when the previous + // mutable page is full. + throw std::invalid_argument{ "Must have at least 2 mutable pages" }; + } + + pages_ = new uint8_t* [buffer_size_]; + for(uint32_t idx = 0; idx < buffer_size_; ++idx) { + pages_[idx] = nullptr; + } + + page_status_ = new FullPageStatus[buffer_size_]; + + PageOffset tail_page_offset = tail_page_offset_.load(); + AllocatePage(tail_page_offset.page()); + AllocatePage(tail_page_offset.page() + 1); + } + + PersistentMemoryMalloc(uint64_t log_size, LightEpoch& epoch, disk_t& disk_, log_file_t& file_, + double log_mutable_fraction) + : PersistentMemoryMalloc(log_size, epoch, disk_, file_, Address{ 0 }, log_mutable_fraction) { + /// Allocate the invalid page. Supports allocations aligned up to kCacheLineBytes. + uint32_t discard; + Allocate(Constants::kCacheLineBytes, discard); + assert(discard == UINT32_MAX); + /// Move the head and read-only address past the invalid page. + Address tail_address = tail_page_offset_.load(); + begin_address.store(tail_address); + read_only_address.store(tail_address); + safe_read_only_address.store(tail_address); + head_address.store(tail_address); + safe_head_address.store(tail_address); + } + + ~PersistentMemoryMalloc() { + if(pages_) { + for(uint32_t idx = 0; idx < buffer_size_; ++idx) { + if(pages_[idx]) { + aligned_free(pages_[idx]); + } + } + delete[] pages_; + } + if(page_status_) { + delete[] page_status_; + } + } + + inline const uint8_t* Page(uint32_t page) const { + assert(page <= Address::kMaxPage); + return pages_[page % buffer_size_]; + } + inline uint8_t* Page(uint32_t page) { + assert(page <= Address::kMaxPage); + return pages_[page % buffer_size_]; + } + + inline const FullPageStatus& PageStatus(uint32_t page) const { + assert(page <= Address::kMaxPage); + return page_status_[page % buffer_size_]; + } + inline FullPageStatus& PageStatus(uint32_t page) { + assert(page <= Address::kMaxPage); + return page_status_[page % buffer_size_]; + } + + inline uint32_t buffer_size() const { + return buffer_size_; + } + + /// Read the tail page + offset, atomically, and convert it to an address. 
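The constructor derives the circular-buffer geometry from the requested log size and mutable fraction; a standalone arithmetic check of those rules (not FASTER code), using an illustrative 1 GB in-memory log:

#include <cassert>
#include <cstdint>

int main() {
  constexpr uint64_t kPageSize = uint64_t{ 1 } << 25;        // 32 MB pages
  uint64_t log_size = uint64_t{ 1 } << 30;                   // 1 GB in-memory region
  double log_mutable_fraction = 0.9;

  assert(log_size % kPageSize == 0);                         // must be a multiple of 32 MB
  uint32_t buffer_size = static_cast<uint32_t>(log_size / kPageSize);
  uint32_t num_mutable_pages = static_cast<uint32_t>(log_mutable_fraction * buffer_size);

  assert(buffer_size == 32);
  assert(num_mutable_pages == 28);                           // 0.9 * 32, truncated
  assert(buffer_size > 4 + 1);                               // more than kNumHeadPages + 1 pages
  assert(num_mutable_pages > 1);                             // at least two mutable pages
  return 0;
}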
+ inline Address GetTailAddress() const { + PageOffset tail_page_offset = tail_page_offset_.load(); + return Address{ tail_page_offset.page(), std::min(Address::kMaxOffset, + static_cast(tail_page_offset.offset())) }; + } + + inline const uint8_t* Get(Address address) const { + return Page(address.page()) + address.offset(); + } + inline uint8_t* Get(Address address) { + return Page(address.page()) + address.offset(); + } + + /// Key function used to allocate memory for a specified number of items. If the current page is + /// full, returns Address::kInvalidAddress and sets closed_page to the current page index. The + /// caller should Refresh() the epoch and call NewPage() until successful, before trying to + /// Allocate() again. + inline Address Allocate(uint32_t num_slots, uint32_t& closed_page); + + /// Tries to move the allocator to a new page; used when the current page is full. Returns "true" + /// if the page advanced (so the caller can try to allocate, again). + inline bool NewPage(uint32_t old_page); + + /// Invoked by users to obtain a record from disk. It uses sector aligned memory to read + /// the record efficiently into memory. + inline void AsyncGetFromDisk(Address address, uint32_t num_records, AsyncIOCallback callback, + AsyncIOContext& context); + + /// Used by applications to make the current state of the database immutable quickly + Address ShiftReadOnlyToTail(); + + void Truncate(GcState::truncate_callback_t callback); + + /// Action to be performed for when all threads have agreed that a page range is closed. + class OnPagesClosed_Context : public IAsyncContext { + public: + OnPagesClosed_Context(alloc_t* allocator_, + Address new_safe_head_address_, + bool replace_with_clean_page_) + : allocator{ allocator_ } + , new_safe_head_address{ new_safe_head_address_ } + , replace_with_clean_page{ replace_with_clean_page_ } { + } + + /// The deep-copy constructor. + OnPagesClosed_Context(const OnPagesClosed_Context& other) + : allocator{ other.allocator } + , new_safe_head_address{ other.new_safe_head_address } + , replace_with_clean_page{ other.replace_with_clean_page } { + } + + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + public: + alloc_t* allocator; + Address new_safe_head_address; + bool replace_with_clean_page; + }; + + static void OnPagesClosed(IAsyncContext* ctxt); + + /// Seal: make sure there are no longer any threads writing to the page + /// Flush: send page to secondary store + class OnPagesMarkedReadOnly_Context : public IAsyncContext { + public: + OnPagesMarkedReadOnly_Context(alloc_t* allocator_, + Address new_safe_read_only_address_, + bool wait_for_pending_flush_complete_) + : allocator{ allocator_ } + , new_safe_read_only_address{ new_safe_read_only_address_ } + , wait_for_pending_flush_complete{ wait_for_pending_flush_complete_ } { + } + + /// The deep-copy constructor. 
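AsyncGetFromDisk() (declared above) must issue reads at sector granularity; the helper it uses rounds the requested byte range outward to sector boundaries and records where the caller's bytes begin inside the aligned buffer. A standalone model of that arithmetic (not FASTER code), assuming sector_size is a power of two:

#include <cassert>
#include <cstdint>

void GetFileReadBoundaries(uint64_t read_offset, uint32_t read_length, uint64_t sector_size,
                           uint64_t& begin_read, uint64_t& end_read,
                           uint32_t& offset, uint32_t& length) {
  uint64_t alignment_mask = sector_size - 1;                 // sector_size must be a power of two
  begin_read = read_offset & ~alignment_mask;                // round start down to a sector
  end_read = (read_offset + read_length + alignment_mask) & ~alignment_mask;  // round end up
  offset = static_cast<uint32_t>(read_offset & alignment_mask);   // caller's data starts here
  length = static_cast<uint32_t>(end_read - begin_read);
}

int main() {
  uint64_t begin, end;
  uint32_t offset, length;
  GetFileReadBoundaries(/*read_offset=*/5000, /*read_length=*/100, /*sector_size=*/4096,
                        begin, end, offset, length);
  assert(begin == 4096 && end == 8192 && offset == 904 && length == 4096);
  return 0;
}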
+ OnPagesMarkedReadOnly_Context(const OnPagesMarkedReadOnly_Context& other) + : allocator{ other.allocator } + , new_safe_read_only_address{ other.new_safe_read_only_address } + , wait_for_pending_flush_complete{ other.wait_for_pending_flush_complete } { + } + + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + public: + alloc_t* allocator; + Address new_safe_read_only_address; + bool wait_for_pending_flush_complete; + }; + + static void OnPagesMarkedReadOnly(IAsyncContext* ctxt); + + private: + inline void GetFileReadBoundaries(Address read_offset, uint32_t read_length, + uint64_t& begin_read, uint64_t& end_read, uint32_t& offset, + uint32_t& length) const { + assert(sector_size > 0); + assert(Utility::IsPowerOfTwo(sector_size)); + assert(sector_size <= UINT32_MAX); + size_t alignment_mask = sector_size - 1; + // Align read to sector boundary. + begin_read = read_offset.control() & ~alignment_mask; + end_read = (read_offset.control() + read_length + alignment_mask) & ~alignment_mask; + offset = static_cast(read_offset.control() & alignment_mask); + assert(end_read - begin_read <= UINT32_MAX); + length = static_cast(end_read - begin_read); + } + + /// Allocate memory page, in sector aligned form + inline void AllocatePage(uint32_t index); + + /// Used by several functions to update the variable to newValue. Ignores if newValue is smaller + /// than the current value. + template + inline bool MonotonicUpdate(A& variable, T new_value, + T& old_value) { + old_value = variable.load(); + while(old_value < new_value) { + if(variable.compare_exchange_strong(old_value, new_value)) { + return true; + } + } + return false; + } + + Status AsyncFlushPages(uint32_t start_page, Address until_address, + bool serialize_objects = false); + + public: + Status AsyncFlushPagesToFile(uint32_t start_page, Address until_address, file_t& file, + std::atomic& flush_pending); + + /// Recovery. + Status AsyncReadPagesFromLog(uint32_t start_page, uint32_t num_pages, + RecoveryStatus& recovery_status); + Status AsyncReadPagesFromSnapshot(file_t& snapshot_file, uint32_t file_start_page, + uint32_t start_page, uint32_t num_pages, + RecoveryStatus& recovery_status); + + Status AsyncFlushPage(uint32_t page, RecoveryStatus& recovery_status, + AsyncCallback caller_callback, IAsyncContext* caller_context); + void RecoveryReset(Address begin_address_, Address head_address_, Address tail_address); + + private: + template + Status AsyncReadPages(F& read_file, uint32_t file_start_page, uint32_t start_page, + uint32_t num_pages, RecoveryStatus& recovery_status); + inline void PageAlignedShiftHeadAddress(uint32_t tail_page); + inline void PageAlignedShiftReadOnlyAddress(uint32_t tail_page); + + /// Every async flush callback tries to update the flushed until address to the latest value + /// possible + /// Is there a better way to do this with enabling fine-grained addresses (not necessarily at + /// page boundaries)? 
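MonotonicUpdate() is the workhorse for the read-only, head, and flushed-until markers: it advances an atomic only when the proposed value is larger, and reports whether this thread performed the advance. A standalone sketch (not FASTER code), specialized to std::atomic<T> for brevity:

#include <atomic>
#include <cassert>
#include <cstdint>

template <typename T>
bool MonotonicUpdate(std::atomic<T>& variable, T new_value, T& old_value) {
  old_value = variable.load();
  while(old_value < new_value) {
    if(variable.compare_exchange_strong(old_value, new_value)) {
      return true;     // this thread advanced the marker
    }                  // otherwise old_value was reloaded with the current value; retry or stop
  }
  return false;        // another thread already moved it at least this far
}

int main() {
  std::atomic<uint64_t> marker{ 10 };
  uint64_t old_value;
  assert(MonotonicUpdate(marker, uint64_t{ 15 }, old_value) && old_value == 10);
  assert(!MonotonicUpdate(marker, uint64_t{ 12 }, old_value) && old_value == 15);
  return 0;
}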
+ inline void ShiftFlushedUntilAddress() { + Address current_flushed_until_address = flushed_until_address.load(); + uint32_t page = current_flushed_until_address.page(); + + bool update = false; + Address page_last_flushed_address = PageStatus(page).LastFlushedUntilAddress.load(); + while(page_last_flushed_address >= current_flushed_until_address) { + current_flushed_until_address = page_last_flushed_address; + update = true; + ++page; + page_last_flushed_address = PageStatus(page).LastFlushedUntilAddress.load(); + } + + if(update) { + Address discard; + MonotonicUpdate(flushed_until_address, current_flushed_until_address, discard); + } + } + + public: + uint32_t sector_size; + + private: + LightEpoch* epoch_; + disk_t* disk; + + public: + log_file_t* file; + // Read buffer pool + NativeSectorAlignedBufferPool read_buffer_pool; + NativeSectorAlignedBufferPool io_buffer_pool; + + /// Every address < ReadOnlyAddress is read-only. + AtomicAddress read_only_address; + /// The minimum ReadOnlyAddress that every thread has seen. + AtomicAddress safe_read_only_address; + + /// The circular buffer can drop any page < HeadAddress.page()--must read those pages from disk. + AtomicAddress head_address; + /// The minimum HeadPage that every thread has seen. + AtomicAddress safe_head_address; + + AtomicAddress flushed_until_address; + + /// The address of the true head of the log--everything before this address has been truncated + /// by garbage collection. + AtomicAddress begin_address; + + private: + uint32_t buffer_size_; + + /// -- the latest N pages should be mutable. + uint32_t num_mutable_pages_; + + // Circular buffer definition + uint8_t** pages_; + + // Array that indicates the status of each buffer page + FullPageStatus* page_status_; + + // Global address of the current tail (next element to be allocated from the circular buffer) + AtomicPageOffset tail_page_offset_; +}; + +/// Implementations. +template +inline void PersistentMemoryMalloc::AllocatePage(uint32_t index) { + index = index % buffer_size_; + assert(pages_[index] == nullptr); + pages_[index] = reinterpret_cast(aligned_alloc(sector_size, kPageSize));; + std::memset(pages_[index], 0, kPageSize); + + // Mark the page as accessible. + page_status_[index].status.store(FlushStatus::Flushed, CloseStatus::Open); +} + +template +inline Address PersistentMemoryMalloc::Allocate(uint32_t num_slots, uint32_t& closed_page) { + closed_page = UINT32_MAX; + PageOffset page_offset = tail_page_offset_.Reserve(num_slots); + + if(page_offset.offset() + num_slots > kPageSize) { + // The current page is full. The caller should Refresh() the epoch and wait until + // NewPage() is successful before trying to Allocate() again. + closed_page = page_offset.page(); + return Address::kInvalidAddress; + } else { + assert(Page(page_offset.page())); + return static_cast
(page_offset); + } +} + +template +inline bool PersistentMemoryMalloc::NewPage(uint32_t old_page) { + assert(old_page < Address::kMaxPage); + PageOffset new_tail_offset{ old_page + 1, 0 }; + // When the tail advances to page k+1, we clear page k+2. + if(old_page + 2 >= safe_head_address.page() + buffer_size_) { + // No room in the circular buffer for a new page; try to advance the head address, to make + // more room available. + disk->TryComplete(); + PageAlignedShiftReadOnlyAddress(old_page + 1); + PageAlignedShiftHeadAddress(old_page + 1); + return false; + } + FlushCloseStatus status = PageStatus(old_page + 1).status.load(); + if(!status.Ready()) { + // Can't access the next page yet; try to advance the head address, to make the page + // available. + disk->TryComplete(); + PageAlignedShiftReadOnlyAddress(old_page + 1); + PageAlignedShiftHeadAddress(old_page + 1); + return false; + } + bool won_cas; + bool retval = tail_page_offset_.NewPage(old_page, won_cas); + if(won_cas) { + // We moved the tail to (page + 1), so we are responsible for moving the head and + // read-only addresses. + PageAlignedShiftReadOnlyAddress(old_page + 1); + PageAlignedShiftHeadAddress(old_page + 1); + if(!Page(old_page + 2)) { + // We are also responsible for allocating (page + 2). + AllocatePage(old_page + 2); + } + } + return retval; +} + +template +inline void PersistentMemoryMalloc::AsyncGetFromDisk(Address address, uint32_t num_records, + AsyncIOCallback callback, AsyncIOContext& context) { + uint64_t begin_read, end_read; + uint32_t offset, length; + GetFileReadBoundaries(address, num_records, begin_read, end_read, offset, length); + context.record = read_buffer_pool.Get(length); + context.record.valid_offset = offset; + context.record.available_bytes = length - offset; + context.record.required_bytes = num_records; + + file->ReadAsync(begin_read, context.record.buffer(), length, callback, context); +} + +template +Address PersistentMemoryMalloc::ShiftReadOnlyToTail() { + Address tail_address = GetTailAddress(); + Address old_read_only_address; + if(MonotonicUpdate(read_only_address, tail_address, old_read_only_address)) { + OnPagesMarkedReadOnly_Context context{ this, tail_address, false }; + IAsyncContext* context_copy; + Status result = context.DeepCopy(context_copy); + assert(result == Status::Ok); + epoch_->BumpCurrentEpoch(OnPagesMarkedReadOnly, context_copy); + } + return tail_address; +} + +template +void PersistentMemoryMalloc::Truncate(GcState::truncate_callback_t callback) { + assert(sector_size > 0); + assert(Utility::IsPowerOfTwo(sector_size)); + assert(sector_size <= UINT32_MAX); + size_t alignment_mask = sector_size - 1; + // Align read to sector boundary. 
+ uint64_t begin_offset = begin_address.control() & ~alignment_mask; + file->Truncate(begin_offset, callback); +} + +template +void PersistentMemoryMalloc::OnPagesClosed(IAsyncContext* ctxt) { + CallbackContext context{ ctxt }; + Address old_safe_head_address; + if(context->allocator->MonotonicUpdate(context->allocator->safe_head_address, + context->new_safe_head_address, + old_safe_head_address)) { + for(uint32_t idx = old_safe_head_address.page(); idx < context->new_safe_head_address.page(); + ++idx) { + FlushCloseStatus old_status = context->allocator->PageStatus(idx).status.load(); + FlushCloseStatus new_status; + do { + new_status = FlushCloseStatus{ old_status.flush, CloseStatus::Closed }; + } while(!context->allocator->PageStatus(idx).status.compare_exchange_weak(old_status, + new_status)); + + if(old_status.flush == FlushStatus::Flushed) { + // We closed the page after it was flushed, so we are responsible for clearing and + // reopening it. + std::memset(context->allocator->Page(idx), 0, kPageSize); + context->allocator->PageStatus(idx).status.store(FlushStatus::Flushed, CloseStatus::Open); + } + } + } +} + +template +void PersistentMemoryMalloc::OnPagesMarkedReadOnly(IAsyncContext* ctxt) { + CallbackContext context{ ctxt }; + Address old_safe_read_only_address; + if(context->allocator->MonotonicUpdate(context->allocator->safe_read_only_address, + context->new_safe_read_only_address, + old_safe_read_only_address)) { + context->allocator->AsyncFlushPages(old_safe_read_only_address.page(), + context->new_safe_read_only_address); + } +} + +template +Status PersistentMemoryMalloc::AsyncFlushPages(uint32_t start_page, Address until_address, + bool serialize_objects) { + class Context : public IAsyncContext { + public: + Context(alloc_t* allocator_, uint32_t page_, Address until_address_) + : allocator{ allocator_ } + , page{ page_ } + , until_address{ until_address_ } { + } + /// The deep-copy constructor + Context(const Context& other) + : allocator{ other.allocator } + , page{ other.page } + , until_address{ other.until_address } { + } + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + public: + alloc_t* allocator; + uint32_t page; + Address until_address; + }; + + auto callback = [](IAsyncContext* ctxt, Status result, size_t bytes_transferred) { + CallbackContext context{ ctxt }; + if(result != Status::Ok) { + fprintf(stderr, "AsyncFlushPages(), error: %u\n", static_cast(result)); + } + context->allocator->PageStatus(context->page).LastFlushedUntilAddress.store( + context->until_address); + //Set the page status to flushed + FlushCloseStatus old_status = context->allocator->PageStatus(context->page).status.load(); + FlushCloseStatus new_status; + do { + new_status = FlushCloseStatus{ FlushStatus::Flushed, old_status.close }; + } while(!context->allocator->PageStatus(context->page).status.compare_exchange_weak( + old_status, new_status)); + if(old_status.close == CloseStatus::Closed) { + // We finished flushing the page after it was closed, so we are responsible for clearing and + // reopening it. 
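+      // Illustrative note: a page is recycled only once it is both Flushed and Closed. This
+      // flush callback and the OnPagesClosed() epoch callback each CAS their half of the status;
+      // whichever one observes the other half already set performs the memset and reopens the
+      // page:
+      //
+      //   flush completes last:  { InProgress, Closed } -> { Flushed, Closed } -> reopened here
+      //   close completes last:  { Flushed, Open }      -> { Flushed, Closed } -> reopened in OnPagesClosed()
+      //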
+ std::memset(context->allocator->Page(context->page), 0, kPageSize); + context->allocator->PageStatus(context->page).status.store(FlushStatus::Flushed, + CloseStatus::Open); + } + context->allocator->ShiftFlushedUntilAddress(); + }; + + uint32_t num_pages = until_address.page() - start_page; + if(until_address.offset() > 0) { + ++num_pages; + } + assert(num_pages > 0); + + for(uint32_t flush_page = start_page; flush_page < start_page + num_pages; ++flush_page) { + Address page_start_address{ flush_page, 0 }; + Address page_end_address{ flush_page + 1, 0 }; + + Context context{ this, flush_page, std::min(page_end_address, until_address) }; + + //Set status to in-progress + FlushCloseStatus old_status = PageStatus(flush_page).status.load(); + FlushCloseStatus new_status; + do { + new_status = FlushCloseStatus{ FlushStatus::InProgress, old_status.close }; + } while(!PageStatus(flush_page).status.compare_exchange_weak(old_status, new_status)); + PageStatus(flush_page).LastFlushedUntilAddress.store(0); + + RETURN_NOT_OK(file->WriteAsync(Page(flush_page), kPageSize * flush_page, kPageSize, callback, + context)); + } + return Status::Ok; +} + +template +Status PersistentMemoryMalloc::AsyncFlushPagesToFile(uint32_t start_page, Address until_address, + file_t& file, std::atomic& flush_pending) { + class Context : public IAsyncContext { + public: + Context(std::atomic& flush_pending_) + : flush_pending{ flush_pending_ } { + } + /// The deep-copy constructor + Context(Context& other) + : flush_pending{ other.flush_pending } { + } + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + public: + std::atomic& flush_pending; + }; + + auto callback = [](IAsyncContext* ctxt, Status result, size_t bytes_transferred) { + CallbackContext context{ ctxt }; + if(result != Status::Ok) { + fprintf(stderr, "AsyncFlushPagesToFile(), error: %u\n", static_cast(result)); + } + assert(context->flush_pending > 0); + --context->flush_pending; + }; + + uint32_t num_pages = until_address.page() - start_page; + if(until_address.offset() > 0) { + ++num_pages; + } + assert(num_pages > 0); + flush_pending = num_pages; + + for(uint32_t flush_page = start_page; flush_page < start_page + num_pages; ++flush_page) { + Address page_start_address{ flush_page, 0 }; + Address page_end_address{ flush_page + 1, 0 }; + Context context{ flush_pending }; + RETURN_NOT_OK(file.WriteAsync(Page(flush_page), kPageSize * (flush_page - start_page), + kPageSize, callback, context)); + } + return Status::Ok; +} + +template +Status PersistentMemoryMalloc::AsyncReadPagesFromLog(uint32_t start_page, uint32_t num_pages, + RecoveryStatus& recovery_status) { + return AsyncReadPages(*file, 0, start_page, num_pages, recovery_status); +} + +template +Status PersistentMemoryMalloc::AsyncReadPagesFromSnapshot(file_t& snapshot_file, + uint32_t file_start_page, uint32_t start_page, uint32_t num_pages, + RecoveryStatus& recovery_status) { + return AsyncReadPages(snapshot_file, file_start_page, start_page, num_pages, recovery_status); +} + +template +template +Status PersistentMemoryMalloc::AsyncReadPages(F& read_file, uint32_t file_start_page, + uint32_t start_page, uint32_t num_pages, RecoveryStatus& recovery_status) { + class Context : public IAsyncContext { + public: + Context(std::atomic& page_status_) + : page_status{ &page_status_ } { + } + /// The deep-copy constructor + Context(const Context& other) + : page_status{ other.page_status } { + } + protected: + Status 
DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + public: + std::atomic* page_status; + }; + + auto callback = [](IAsyncContext* ctxt, Status result, size_t bytes_transferred) { + CallbackContext context{ ctxt }; + if(result != Status::Ok) { + fprintf(stderr, "Error: %u\n", static_cast(result)); + } + assert(context->page_status->load() == PageRecoveryStatus::IssuedRead); + context->page_status->store(PageRecoveryStatus::ReadDone); + }; + + for(uint32_t read_page = start_page; read_page < start_page + num_pages; ++read_page) { + if(!Page(read_page)) { + // Allocate a new page. + AllocatePage(read_page); + } else { + // Clear an old used page. + std::memset(Page(read_page), 0, kPageSize); + } + assert(recovery_status.page_status(read_page) == PageRecoveryStatus::NotStarted); + recovery_status.page_status(read_page).store(PageRecoveryStatus::IssuedRead); + PageStatus(read_page).LastFlushedUntilAddress.store(Address{ read_page + 1, 0 }); + Context context{ recovery_status.page_status(read_page) }; + RETURN_NOT_OK(read_file.ReadAsync(kPageSize * (read_page - file_start_page), Page(read_page), + kPageSize, callback, context)); + } + return Status::Ok; +} + +template +Status PersistentMemoryMalloc::AsyncFlushPage(uint32_t page, RecoveryStatus& recovery_status, + AsyncCallback caller_callback, IAsyncContext* caller_context) { + class Context : public IAsyncContext { + public: + Context(std::atomic& page_status_, AsyncCallback caller_callback_, + IAsyncContext* caller_context_) + : page_status{ &page_status_ } + , caller_callback{ caller_callback_ } + , caller_context{ caller_context_ } { + } + /// The deep-copy constructor + Context(const Context& other, IAsyncContext* caller_context_copy) + : page_status{ other.page_status } + , caller_callback{ other.caller_callback } + , caller_context{ caller_context_copy } { + } + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + if(caller_callback) { + return IAsyncContext::DeepCopy_Internal(*this, caller_context, context_copy); + } else { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + } + public: + std::atomic* page_status; + AsyncCallback caller_callback; + IAsyncContext* caller_context; + }; + + auto callback = [](IAsyncContext* ctxt, Status result, size_t bytes_transferred) { + CallbackContext context{ ctxt }; + if(result != Status::Ok) { + fprintf(stderr, "Error: %u\n", static_cast(result)); + } + assert(context->page_status->load() == PageRecoveryStatus::IssuedFlush); + context->page_status->store(PageRecoveryStatus::FlushDone); + if(context->caller_callback) { + context->caller_callback(context->caller_context, result); + } + }; + + assert(recovery_status.page_status(page) == PageRecoveryStatus::ReadDone); + recovery_status.page_status(page).store(PageRecoveryStatus::IssuedFlush); + PageStatus(page).LastFlushedUntilAddress.store(Address{ page + 1, 0 }); + Context context{ recovery_status.page_status(page), caller_callback, caller_context }; + return file->WriteAsync(Page(page), kPageSize * page, kPageSize, callback, context); +} + +template +void PersistentMemoryMalloc::RecoveryReset(Address begin_address_, Address head_address_, + Address tail_address) { + begin_address.store(begin_address_); + tail_page_offset_.store(tail_address); + // issue read request to all pages until head lag + head_address.store(head_address_); + safe_head_address.store(head_address_); + + flushed_until_address.store(Address{ tail_address.page(), 0 
}); + read_only_address.store(tail_address); + safe_read_only_address.store(tail_address); + + uint32_t start_page = head_address_.page(); + uint32_t end_page = tail_address.offset() == 0 ? tail_address.page() : tail_address.page() + 1; + if(!Page(end_page)) { + AllocatePage(end_page); + } + if(!Page(end_page + 1)) { + AllocatePage(end_page + 1); + } + + for(uint32_t idx = 0; idx < buffer_size_; ++idx) { + PageStatus(idx).status.store(FlushStatus::Flushed, CloseStatus::Open); + } +} + +template +inline void PersistentMemoryMalloc::PageAlignedShiftHeadAddress(uint32_t tail_page) { + //obtain local values of variables that can change + Address current_head_address = head_address.load(); + Address current_flushed_until_address = flushed_until_address.load(); + + if(tail_page <= (buffer_size_ - kNumHeadPages)) { + // Desired head address is <= 0. + return; + } + + Address desired_head_address{ tail_page - (buffer_size_ - kNumHeadPages), 0 }; + + if(current_flushed_until_address < desired_head_address) { + desired_head_address = Address{ current_flushed_until_address.page(), 0 }; + } + + Address old_head_address; + if(MonotonicUpdate(head_address, desired_head_address, old_head_address)) { + OnPagesClosed_Context context{ this, desired_head_address, false }; + IAsyncContext* context_copy; + Status result = context.DeepCopy(context_copy); + assert(result == Status::Ok); + epoch_->BumpCurrentEpoch(OnPagesClosed, context_copy); + } +} + +template +inline void PersistentMemoryMalloc::PageAlignedShiftReadOnlyAddress(uint32_t tail_page) { + Address current_read_only_address = read_only_address.load(); + if(tail_page <= num_mutable_pages_) { + // Desired read-only address is <= 0. + return; + } + + Address desired_read_only_address{ tail_page - num_mutable_pages_, 0 }; + Address old_read_only_address; + if(MonotonicUpdate(read_only_address, desired_read_only_address, old_read_only_address)) { + OnPagesMarkedReadOnly_Context context{ this, desired_read_only_address, false }; + IAsyncContext* context_copy; + Status result = context.DeepCopy(context_copy); + assert(result == Status::Ok); + epoch_->BumpCurrentEpoch(OnPagesMarkedReadOnly, context_copy); + } +} + +} +} // namespace FASTER::core diff --git a/cc/src/core/phase.h b/cc/src/core/phase.h new file mode 100644 index 000000000..9b3ab1dd1 --- /dev/null +++ b/cc/src/core/phase.h @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "alloc.h" +#include "async.h" +#include "constants.h" +#include "thread.h" +#include "utility.h" + +namespace FASTER { +namespace core { + +/// Phases, used internally by FASTER to keep track of how far along FASTER has gotten during +/// checkpoint, gc, and grow actions. +enum class Phase : uint8_t { + /// Checkpoint phases. + PREP_INDEX_CHKPT, + INDEX_CHKPT, + PREPARE, + IN_PROGRESS, + WAIT_PENDING, + WAIT_FLUSH, + REST, + PERSISTENCE_CALLBACK, + /// Garbage-collection phases. + /// - The log's begin-address has been shifted; finish all outstanding I/Os before trying to + /// truncate the log. + GC_IO_PENDING, + /// - The log has been truncated, but threads are still cleaning the hash table. + GC_IN_PROGRESS, + /// Grow-index phases. + /// - Each thread waits for all other threads to complete outstanding (synchronous) operations + /// against the hash table. + GROW_PREPARE, + /// - Each thread copies a chunk of the old hash table into the new hash table. 
+ GROW_IN_PROGRESS, + INVALID +}; + +} +} // namespace FASTER::core \ No newline at end of file diff --git a/cc/src/core/record.h b/cc/src/core/record.h new file mode 100644 index 000000000..9117564fb --- /dev/null +++ b/cc/src/core/record.h @@ -0,0 +1,151 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include "address.h" +#include "auto_ptr.h" + +namespace FASTER { +namespace core { + +/// Record header, internal to FASTER. +class RecordInfo { + public: + RecordInfo(uint16_t checkpoint_version_, bool final_bit_, bool tombstone_, bool invalid_, + Address previous_address) + : checkpoint_version{ checkpoint_version_ } + , final_bit{ final_bit_ } + , tombstone{ tombstone_ } + , invalid{ invalid_ } + , previous_address_{ previous_address.control() } { + } + + RecordInfo(const RecordInfo& other) + : control_{ other.control_ } { + } + + inline bool IsNull() const { + return control_ == 0; + } + inline Address previous_address() const { + return Address{ previous_address_ }; + } + + union { + struct { + uint64_t previous_address_ : 48; + uint64_t checkpoint_version : 13; + uint64_t invalid : 1; + uint64_t tombstone : 1; + uint64_t final_bit : 1; + }; + + uint64_t control_; + }; +}; +static_assert(sizeof(RecordInfo) == 8, "sizeof(RecordInfo) != 8"); +static_assert(sizeof(RecordInfo) == 8, "sizeof(RecordInfo) != 8"); + +/// A record stored in the log. The log starts at 0 (mod 64), and consists of Records, one after +/// the other. Each record's header is 8 bytes. +template +struct Record { + // To support records with alignment > 64, modify the persistent-memory allocator to allocate + // a larger NULL page on startup. + static_assert(alignof(key_t) <= Constants::kCacheLineBytes, + "alignof(key_t) > Constants::kCacheLineBytes)"); + static_assert(alignof(value_t) <= Constants::kCacheLineBytes, + "alignof(value_t) > Constants::kCacheLineBytes)"); + + /// For placement new() operator. Can't set value, since it might be set by value = input (for + /// upsert), or rmw_initial(...) (for RMW). + Record(RecordInfo header_, const key_t& key_) + : header{ header_ } { + void* buffer = const_cast(&key()); + new(buffer)key_t{ key_ }; + } + + /// Key appears immediately after record header (subject to alignment padding). Keys are + /// immutable. + inline constexpr const key_t& key() const { + const uint8_t* head = reinterpret_cast(this); + size_t offset = pad_alignment(sizeof(RecordInfo), alignof(key_t)); + return *reinterpret_cast(head + offset); + } + + /// Value appears immediately after key (subject to alignment padding). Values can be modified. + inline constexpr const value_t& value() const { + const uint8_t* head = reinterpret_cast(this); + size_t offset = pad_alignment(key().size() + + pad_alignment(sizeof(RecordInfo), alignof(key_t)), + alignof(value_t)); + return *reinterpret_cast(head + offset); + } + inline constexpr value_t& value() { + uint8_t* head = reinterpret_cast(this); + size_t offset = pad_alignment(key().size() + + pad_alignment(sizeof(RecordInfo), alignof(key_t)), + alignof(value_t)); + return *reinterpret_cast(head + offset); + } + + /// Size of a record to be created, in memory. (Includes padding, if any, after the value, so + /// that the next record stored in the log is properly aligned.) + static inline constexpr uint32_t size(const key_t& key_, uint32_t value_size) { + return static_cast( + // --plus Value size, all padded to Header alignment. 
+ pad_alignment(value_size + + // --plus Key size, all padded to Value alignment. + pad_alignment(key_.size() + + // Header, padded to Key alignment. + pad_alignment(sizeof(RecordInfo), alignof(key_t)), + alignof(value_t)), + alignof(RecordInfo))); + } + /// Size of the existing record, in memory. (Includes padding, if any, after the value.) + inline constexpr uint32_t size() const { + return size(key(), value().size()); + } + + /// Minimum size of a read from disk that is guaranteed to include the record's header + whatever + /// information class key_t needs to determine its key size. + static inline constexpr uint32_t min_disk_key_size() { + return static_cast( + // -- plus sizeof(key_t). + sizeof(key_t) + + // Header size, padded to Key alignment. + pad_alignment(sizeof(RecordInfo), alignof(key_t))); + } + + /// Minimum size of a read from disk that is guaranteed to include the record's header, key, + // and whatever information the host needs to determine the value size. + inline constexpr uint32_t min_disk_value_size() const { + return static_cast( + // -- plus size of the Value's header. + sizeof(value_t) + + // --plus Key size, padded to Base Value alignment. + pad_alignment(key().size() + + // Header, padded to Key alignment. + pad_alignment(sizeof(RecordInfo), alignof(key_t)), + alignof(value_t)) + ); + } + + /// Size of a record, on disk. (Excludes padding, if any, after the value.) + inline constexpr uint32_t disk_size() const { + return static_cast(value().size() + + pad_alignment(key().size() + + // Header, padded to Key alignment. + pad_alignment(sizeof(RecordInfo), alignof(key_t)), + alignof(value_t))); + } + + public: + RecordInfo header; +}; + +} +} // namespace FASTER::core diff --git a/cc/src/core/recovery_status.h b/cc/src/core/recovery_status.h new file mode 100644 index 000000000..111bfade1 --- /dev/null +++ b/cc/src/core/recovery_status.h @@ -0,0 +1,59 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include + +namespace FASTER { +namespace core { + +/// Used by FASTER to track status, during recovery action. + +enum class PageRecoveryStatus { + NotStarted = 0, + IssuedRead, + ReadDone, + IssuedFlush, + FlushDone +}; + +class RecoveryStatus { + public: + RecoveryStatus(uint32_t start_page_, uint32_t end_page_) + : start_page{ start_page_ } + , end_page{ end_page_ } + , page_status_{ nullptr } { + assert(end_page >= start_page); + uint32_t buffer_size = end_page - start_page; + page_status_ = new std::atomic[buffer_size]; + std::memset(page_status_, 0, sizeof(std::atomic) * buffer_size); + } + + ~RecoveryStatus() { + if(page_status_) { + delete page_status_; + } + } + + const std::atomic& page_status(uint32_t page) const { + assert(page >= start_page); + assert(page < end_page); + return page_status_[page - start_page]; + } + std::atomic& page_status(uint32_t page) { + assert(page >= start_page); + assert(page < end_page); + return page_status_[page - start_page]; + } + + uint32_t start_page; + uint32_t end_page; + + private: + std::atomic* page_status_; +}; + +} +} // namespace FASTER::core diff --git a/cc/src/core/state_transitions.h b/cc/src/core/state_transitions.h new file mode 100644 index 000000000..5796fb271 --- /dev/null +++ b/cc/src/core/state_transitions.h @@ -0,0 +1,162 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
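+// Illustrative sketch of the checkpoint state machine defined below (assumes a caller that
+// simply drives GetNextState() in a loop): starting from { Checkpoint, REST, v }, the phases
+// advance PREP_INDEX_CHKPT -> INDEX_CHKPT -> PREPARE -> IN_PROGRESS (version becomes v + 1)
+// -> WAIT_PENDING -> WAIT_FLUSH -> PERSISTENCE_CALLBACK -> REST, e.g.
+//
+//   SystemState state{ Action::Checkpoint, Phase::REST, 1 };
+//   do {
+//     state = state.GetNextState();
+//   } while(state.phase != Phase::REST);
+//   assert(state.version == 2);
+//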
+ +#pragma once + +#include +#include +#include "phase.h" + +namespace FASTER { +namespace core { + +struct ResizeInfo { + uint8_t version; +}; + +/// Each FASTER store can perform only one action at a time (checkpoint, recovery, garbage +// collect, or grow index). +enum class Action : uint8_t { + None = 0, + Checkpoint, + Recover, + GC, + GrowIndex +}; + +struct SystemState { + SystemState(Action action_, Phase phase_, uint32_t version_) + : control_{ 0 } { + action = action_; + phase = phase_; + version = version_; + } + SystemState(uint64_t control) + : control_{ control } { + } + SystemState(const SystemState& other) + : control_{ other.control_ } { + } + + inline SystemState& operator=(const SystemState& other) { + control_ = other.control_; + return *this; + } + inline bool operator==(const SystemState& other) { + return control_ == other.control_; + } + inline bool operator!=(const SystemState& other) { + return control_ != other.control_; + } + + /// The state transitions. + inline SystemState GetNextState() const { + switch(action) { + case Action::Checkpoint: + switch(phase) { + case Phase::REST: + return SystemState{ Action::Checkpoint, Phase::PREP_INDEX_CHKPT, version }; + case Phase::PREP_INDEX_CHKPT: + return SystemState{ Action::Checkpoint, Phase::INDEX_CHKPT, version }; + case Phase::INDEX_CHKPT: + return SystemState{ Action::Checkpoint, Phase::PREPARE, version }; + case Phase::PREPARE: + return SystemState{ Action::Checkpoint, Phase::IN_PROGRESS, version + 1 }; + case Phase::IN_PROGRESS: + return SystemState{ Action::Checkpoint, Phase::WAIT_PENDING, version }; + case Phase::WAIT_PENDING: + return SystemState{ Action::Checkpoint, Phase::WAIT_FLUSH, version }; + case Phase::WAIT_FLUSH: + return SystemState{ Action::Checkpoint, Phase::PERSISTENCE_CALLBACK, version }; + case Phase::PERSISTENCE_CALLBACK: + return SystemState{ Action::Checkpoint, Phase::REST, version }; + default: + // not reached + assert(false); + return SystemState(UINT64_MAX); + } + break; + case Action::GC: + switch(phase) { + case Phase::REST: + return SystemState{ Action::GC, Phase::GC_IO_PENDING, version }; + case Phase::GC_IO_PENDING: + return SystemState{ Action::GC, Phase::GC_IN_PROGRESS, version }; + case Phase::GC_IN_PROGRESS: + return SystemState{ Action::GC, Phase::REST, version }; + default: + // not reached + assert(false); + return SystemState(UINT64_MAX); + } + break; + case Action::GrowIndex: + switch(phase) { + case Phase::REST: + return SystemState{ Action::GrowIndex, Phase::GROW_PREPARE, version }; + case Phase::GROW_PREPARE: + return SystemState{ Action::GrowIndex, Phase::GROW_IN_PROGRESS, version }; + case Phase::GROW_IN_PROGRESS: + return SystemState{ Action::GrowIndex, Phase::REST, version }; + default: + // not reached + assert(false); + return SystemState(UINT64_MAX); + } + default: + // not reached + assert(false); + return SystemState(UINT64_MAX); + } + } + + union { + struct { + /// Action being performed (checkpoint, recover, or gc). + Action action; + /// Phase of that action currently being executed. + Phase phase; + /// Checkpoint version (used for CPR). + uint32_t version; + }; + uint64_t control_; + }; +}; +static_assert(sizeof(SystemState) == 8, "sizeof(SystemState) != 8"); + +class AtomicSystemState { + public: + AtomicSystemState(Action action_, Phase phase_, uint32_t version_) { + SystemState state{ action_, phase_, version_ }; + control_.store(state.control_); + } + + /// Atomic access. 
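+  /// Illustrative sketch (system_state_ is a hypothetical AtomicSystemState member, as a store
+  /// might hold): threads advance the global state by CAS-ing in the successor of the state they
+  /// last observed, so at most one thread wins any given transition:
+  ///
+  ///   SystemState expected = system_state_.load();
+  ///   SystemState desired = expected.GetNextState();
+  ///   if(system_state_.compare_exchange_strong(expected, desired)) {
+  ///     // This thread won the transition; others observe the new state on their next load.
+  ///   }
+  ///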
+ inline SystemState load() const { + return SystemState{ control_.load() }; + } + inline void store(SystemState value) { + control_.store(value.control_); + } + inline bool compare_exchange_strong(SystemState& expected, SystemState desired) { + uint64_t expected_control = expected.control_; + bool result = control_.compare_exchange_strong(expected_control, desired.control_); + expected = SystemState{ expected_control }; + return result; + } + + /// Accessors. + inline Phase phase() const { + return load().phase; + } + inline uint32_t version() const { + return load().version; + } + + private: + /// Atomic access to the system state. + std::atomic control_; +}; + +} +} // namespace FASTER::core diff --git a/cc/src/core/status.h b/cc/src/core/status.h new file mode 100644 index 000000000..0ecb48bed --- /dev/null +++ b/cc/src/core/status.h @@ -0,0 +1,30 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once +#include + +namespace FASTER { +namespace core { + +enum class Status : uint8_t { + Ok = 0, + Pending = 1, + NotFound = 2, + OutOfMemory = 3, + IOError = 4, + Corruption = 5, + Aborted = 6, +}; + +enum class InternalStatus : uint8_t { + Ok, + RETRY_NOW, + RETRY_LATER, + RECORD_ON_DISK, + SUCCESS_UNMARK, + CPR_SHIFT_DETECTED +}; + +} +} // namespace FASTER::core diff --git a/cc/src/core/thread.cc b/cc/src/core/thread.cc new file mode 100644 index 000000000..2ebefa0ba --- /dev/null +++ b/cc/src/core/thread.cc @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include "thread.h" + +namespace FASTER { +namespace core { + +/// The first thread will have index 0. +std::atomic Thread::next_index_{ 0 }; + +/// No thread IDs have been used yet. +std::atomic Thread::id_used_[kMaxNumThreads] = {}; + +#ifdef COUNT_ACTIVE_THREADS +std::atomic Thread::current_num_threads_ { 0 }; +#endif + +/// Give the new thread an ID. (In this implementation, threads get IDs when they are created, and +/// release them when they are freed. We will eventually merge chkulk's improvements, from another +/// branch, and then threads will get IDs on their first call to FasterKv::StartSession(), while +/// still releasing IDs when they are freed.) +thread_local Thread::ThreadId Thread::id_{}; + +} +} // namespace FASTER::core diff --git a/cc/src/core/thread.h b/cc/src/core/thread.h new file mode 100644 index 000000000..24af3474a --- /dev/null +++ b/cc/src/core/thread.h @@ -0,0 +1,103 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include +#include + +/// Turn this on to have Thread::current_num_threads_ keep a count of currently-active threads. +#undef COUNT_ACTIVE_THREADS + +namespace FASTER { +namespace core { + +/// Gives every thread a unique, numeric thread ID, and recycles IDs when threads exit. +class Thread { + public: + /// The number of entries in table. Currently, this is fixed at 64 and never changes or grows. + /// If the table runs out of entries, then the current implementation will throw a + /// std::runtime_error. + static constexpr size_t kMaxNumThreads = 96; + + private: + /// Encapsulates a thread ID, getting a free ID from the Thread class when the thread starts, and + /// releasing it back to the Thread class, when the thread exits. 
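+  /// Illustrative sketch: the ID is backed by the thread_local id_ below, so the value returned
+  /// by Thread::id() is stable for the lifetime of the thread, and the slot in id_used_[] is
+  /// recycled once the thread exits (names in the example are arbitrary):
+  ///
+  ///   std::thread worker{ [] {
+  ///     uint32_t my_id = Thread::id();   // this thread's slot in id_used_[]
+  ///     assert(my_id == Thread::id());   // same value for this thread from now on
+  ///   } };
+  ///   worker.join();                     // slot recycled when the thread exits
+  ///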
+ class ThreadId { + public: + static constexpr uint32_t kInvalidId = UINT32_MAX; + + inline ThreadId(); + inline ~ThreadId(); + + inline uint32_t id() const { + return id_; + } + + private: + uint32_t id_; + }; + + public: + /// Call static method Thread::id() to get the executing thread's ID. + inline static uint32_t id() { + return id_.id(); + } + + private: + /// Methods ReserveEntry() and ReleaseEntry() do the real work. + inline static uint32_t ReserveEntry() { +#ifdef COUNT_ACTIVE_THREADS + int32_t result = ++current_num_threads_; + assert(result < kMaxNumThreads); +#endif + uint32_t start = next_index_++; + uint32_t end = start + 2 * kMaxNumThreads; + for(uint32_t id = start; id < end; ++id) { + bool expected = false; + if(id_used_[id % kMaxNumThreads].compare_exchange_strong(expected, true)) { + return id % kMaxNumThreads; + } + } + // Already have 64 active threads. + throw std::runtime_error{ "Too many threads!" }; + } + + inline static void ReleaseEntry(uint32_t id) { + assert(id != ThreadId::kInvalidId); + assert(id_used_[id].load()); + id_used_[id] = false; +#ifdef COUNT_ACTIVE_THREADS + int32_t result = --current_num_threads_; +#endif + } + + /// The current thread's page_index. + static thread_local ThreadId id_; + + /// Next thread index to consider. + static std::atomic next_index_; + /// Which thread IDs have already been taken. + static std::atomic id_used_[kMaxNumThreads]; + +#ifdef COUNT_ACTIVE_THREADS + static std::atomic current_num_threads_; +#endif + + friend class ThreadId; +}; + +inline Thread::ThreadId::ThreadId() + : id_{ kInvalidId } { + id_ = Thread::ReserveEntry(); +} + +inline Thread::ThreadId::~ThreadId() { + Thread::ReleaseEntry(id_); +} + +} +} // namespace FASTER::core diff --git a/cc/src/core/utility.h b/cc/src/core/utility.h new file mode 100644 index 000000000..83211bddd --- /dev/null +++ b/cc/src/core/utility.h @@ -0,0 +1,56 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include + +namespace FASTER { +namespace core { + +class Utility { + public: + static inline uint64_t Rotr64(uint64_t x, std::size_t n) { + return (((x) >> n) | ((x) << (64 - n))); + } + + static inline uint64_t GetHashCode(uint64_t input) { + uint64_t local_rand = input; + uint64_t local_rand_hash = 8; + local_rand_hash = 40343 * local_rand_hash + ((local_rand) & 0xFFFF); + local_rand_hash = 40343 * local_rand_hash + ((local_rand >> 16) & 0xFFFF); + local_rand_hash = 40343 * local_rand_hash + ((local_rand >> 32) & 0xFFFF); + local_rand_hash = 40343 * local_rand_hash + (local_rand >> 48); + local_rand_hash = 40343 * local_rand_hash; + return Rotr64(local_rand_hash, 43); + //Func hash = + // e => 40343 * (40343 * (40343 * (40343 * (40343 * 8 + (long)((e) & 0xFFFF)) + (long)((e >> 16) & 0xFFFF)) + (long)((e >> 32) & 0xFFFF)) + (long)(e >> 48)); + } + + static inline uint64_t HashBytes(const uint16_t* str, size_t len) { + // 40343 is a "magic constant" that works well, + // 38299 is another good value. + // Both are primes and have a good distribution of bits. + const uint64_t kMagicNum = 40343; + uint64_t hashState = len; + + for(size_t idx = 0; idx < len; ++idx) { + hashState = kMagicNum * hashState + str[idx]; + } + + // The final scrambling helps with short keys that vary only on the high order bits. + // Low order bits are not always well distributed so shift them to the high end, where they'll + // form part of the 14-bit tag. 
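+    // Usage sketch (the key is supplied as 16-bit words, per the signature above):
+    //
+    //   const uint16_t key[] = { 1, 2, 3, 4 };
+    //   uint64_t h = Utility::HashBytes(key, 4);
+    //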
+ return Rotr64(kMagicNum * hashState, 6); + } + + static constexpr inline bool IsPowerOfTwo(uint64_t x) { + return (x > 0) && ((x & (x - 1)) == 0); + } +}; + +} +} // namespace FASTER::core diff --git a/cc/src/device/file_system_disk.h b/cc/src/device/file_system_disk.h new file mode 100644 index 000000000..3c076e1d1 --- /dev/null +++ b/cc/src/device/file_system_disk.h @@ -0,0 +1,527 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include + +#include "../core/gc_state.h" +#include "../core/light_epoch.h" +#include "../core/utility.h" +#include "../environment/file.h" + +/// Wrapper that exposes files to FASTER. Encapsulates segmented files, etc. + +namespace FASTER { +namespace device { + +template +class FileSystemDisk; + +template +class FileSystemFile { + public: + typedef H handler_t; + typedef typename handler_t::async_file_t file_t; + + /// Default constructor + FileSystemFile() + : file_{} + , file_options_{} { + } + + FileSystemFile(const std::string& filename, const environment::FileOptions& file_options) + : file_{ filename } + , file_options_{ file_options } { + } + + /// Move constructor. + FileSystemFile(FileSystemFile&& other) + : file_{ std::move(other.file_) } + , file_options_{ other.file_options_ } { + } + + /// Move assignment operator. + FileSystemFile& operator=(FileSystemFile&& other) { + file_ = std::move(other.file_); + file_options_ = other.file_options_; + return *this; + } + + Status Open(handler_t* handler) { + return file_.Open(FASTER::environment::FileCreateDisposition::OpenOrCreate, file_options_, + handler, nullptr); + } + Status Close() { + return file_.Close(); + } + Status Delete() { + return file_.Delete(); + } + void Truncate(uint64_t new_begin_offset, GcState::truncate_callback_t callback) { + // Truncation is a no-op. + if(callback) { + callback(new_begin_offset); + } + } + + Status ReadAsync(uint64_t source, void* dest, uint32_t length, + AsyncIOCallback callback, IAsyncContext& context) const { + return file_.Read(source, length, reinterpret_cast(dest), context, callback); + } + Status WriteAsync(const void* source, uint64_t dest, uint32_t length, + AsyncIOCallback callback, IAsyncContext& context) { + return file_.Write(dest, length, reinterpret_cast(source), context, callback); + } + + size_t alignment() const { + return file_.device_alignment(); + } + + private: + file_t file_; + environment::FileOptions file_options_; +}; + +/// Manages a bundle of segment files. 
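+/// Illustrative note: a "segment" is one fixed-size slice of the logical log, stored as its own
+/// file named <base filename><segment#>. With a segment size S of 1 GB (an example value only),
+/// a logical offset maps to a segment and an offset within it as in FileSystemSegmentedFile:
+///
+///   uint64_t offset  = 5368709120ULL;           // some logical log offset (5 GB)
+///   uint64_t segment = offset / kSegmentSize;   // -> 5, i.e. file "log.log5"
+///   uint64_t within  = offset % kSegmentSize;   // -> 0
+///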
+template +class FileSystemSegmentBundle { + public: + typedef H handler_t; + typedef FileSystemFile file_t; + typedef FileSystemSegmentBundle bundle_t; + + FileSystemSegmentBundle(const std::string& filename, + const environment::FileOptions& file_options, handler_t* handler, + uint64_t begin_segment_, uint64_t end_segment_) + : filename_{ filename } + , file_options_{ file_options } + , begin_segment{ begin_segment_ } + , end_segment{ end_segment_ } + , owner_{ true } { + for(uint64_t idx = begin_segment; idx < end_segment; ++idx) { + new(files() + (idx - begin_segment)) file_t{ filename_ + std::to_string(idx), + file_options_ }; + Status result = file(idx).Open(handler); + assert(result == Status::Ok); + } + } + + FileSystemSegmentBundle(handler_t* handler, uint64_t begin_segment_, uint64_t end_segment_, + bundle_t& other) + : filename_{ std::move(other.filename_) } + , file_options_{ other.file_options_ } + , begin_segment{ begin_segment_ } + , end_segment{ end_segment_ } + , owner_{ true } { + assert(end_segment >= other.end_segment); + + uint64_t begin_new = begin_segment; + uint64_t begin_copy = std::max(begin_segment, other.begin_segment); + uint64_t end_copy = std::min(end_segment, other.end_segment); + uint64_t end_new = end_segment; + + for(uint64_t idx = begin_segment; idx < begin_copy; ++idx) { + new(files() + (idx - begin_segment)) file_t{ filename_ + std::to_string(idx), + file_options_ }; + Status result = file(idx).Open(handler); + assert(result == Status::Ok); + } + for(uint64_t idx = begin_copy; idx < end_copy; ++idx) { + // Move file handles for segments already opened. + new(files() + (idx - begin_segment)) file_t{ std::move(other.file(idx)) }; + } + for(uint64_t idx = end_copy; idx < end_new; ++idx) { + new(files() + (idx - begin_segment)) file_t{ filename_ + std::to_string(idx), + file_options_ }; + Status result = file(idx).Open(handler); + assert(result == Status::Ok); + } + + other.owner_ = false; + } + + ~FileSystemSegmentBundle() { + if(owner_) { + for(uint64_t idx = begin_segment; idx < end_segment; ++idx) { + file(idx).~file_t(); + } + } + } + + Status Close() { + assert(owner_); + Status result = Status::Ok; + for(uint64_t idx = begin_segment; idx < end_segment; ++idx) { + Status r = file(idx).Close(); + if(r != Status::Ok) { + // We'll report the last error. + result = r; + } + } + return result; + } + + Status Delete() { + assert(owner_); + Status result = Status::Ok; + for(uint64_t idx = begin_segment; idx < end_segment; ++idx) { + Status r = file(idx).Delete(); + if(r != Status::Ok) { + // We'll report the last error. 
+ result = r; + } + } + return result; + } + + file_t* files() { + return reinterpret_cast(this + 1); + } + file_t& file(uint64_t segment) { + assert(segment >= begin_segment); + return files()[segment - begin_segment]; + } + bool exists(uint64_t segment) const { + return segment >= begin_segment && segment < end_segment; + } + + static constexpr uint64_t size(uint64_t num_segments) { + return sizeof(bundle_t) + num_segments * sizeof(file_t); + } + + public: + const uint64_t begin_segment; + const uint64_t end_segment; + private: + std::string filename_; + environment::FileOptions file_options_; + bool owner_; +}; + +template +class FileSystemSegmentedFile { + public: + typedef H handler_t; + typedef FileSystemFile file_t; + typedef FileSystemSegmentBundle bundle_t; + + static constexpr uint64_t kSegmentSize = S; + static_assert(Utility::IsPowerOfTwo(S), "template parameter S is not a power of two!"); + + FileSystemSegmentedFile(const std::string& filename, + const environment::FileOptions& file_options, LightEpoch* epoch) + : begin_segment_{ 0 } + , files_{ nullptr } + , handler_{ nullptr } + , filename_{ filename } + , file_options_{ file_options } + , epoch_{ epoch } { + } + + ~FileSystemSegmentedFile() { + bundle_t* files = files_.load(); + if(files) { + files->~bundle_t(); + std::free(files); + } + } + + Status Open(handler_t* handler) { + handler_ = handler; + return Status::Ok; + } + Status Close() { + return (files_) ? files_->Close() : Status::Ok; + } + Status Delete() { + return (files_) ? files_->Delete() : Status::Ok; + } + void Truncate(uint64_t new_begin_offset, GcState::truncate_callback_t callback) { + uint64_t new_begin_segment = new_begin_offset / kSegmentSize; + begin_segment_ = new_begin_segment; + TruncateSegments(new_begin_segment, callback); + } + + Status ReadAsync(uint64_t source, void* dest, uint32_t length, AsyncIOCallback callback, + IAsyncContext& context) const { + uint64_t segment = source / kSegmentSize; + assert(source % kSegmentSize + length <= kSegmentSize); + + bundle_t* files = files_.load(); + + if(!files || !files->exists(segment)) { + Status result = const_cast*>(this)->OpenSegment(segment); + if(result != Status::Ok) { + return result; + } + files = files_.load(); + } + return files->file(segment).ReadAsync(source % kSegmentSize, dest, length, callback, context); + } + + Status WriteAsync(const void* source, uint64_t dest, uint32_t length, + AsyncIOCallback callback, IAsyncContext& context) { + uint64_t segment = dest / kSegmentSize; + assert(dest % kSegmentSize + length <= kSegmentSize); + + bundle_t* files = files_.load(); + + if(!files || !files->exists(segment)) { + Status result = OpenSegment(segment); + if(result != Status::Ok) { + return result; + } + files = files_.load(); + } + return files->file(segment).WriteAsync(source, dest % kSegmentSize, length, callback, context); + } + + size_t alignment() const { + return 512; // For now, assume all disks have 512-bytes alignment. + } + + private: + Status OpenSegment(uint64_t segment) { + class Context : public IAsyncContext { + public: + Context(void* files_) + : files{ files_ } { + } + /// The deep-copy constructor. 
+ Context(const Context& other) + : files{ other.files} { + } + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + public: + void* files; + }; + + auto callback = [](IAsyncContext* ctxt) { + CallbackContext context{ ctxt }; + std::free(context->files); + }; + + // Only one thread can modify the list of files at a given time. + std::lock_guard lock{ mutex_ }; + bundle_t* files = files_.load(); + + if(segment < begin_segment_) { + // The requested segment has been truncated. + return Status::IOError; + } + if(files && files->exists(segment)) { + // Some other thread already opened this segment for us. + return Status::Ok; + } + + if(!files) { + // First segment opened. + void* buffer = std::malloc(bundle_t::size(1)); + bundle_t* new_files = new(buffer) bundle_t{ filename_, file_options_, handler_, + segment, segment + 1 }; + files_.store(new_files); + return Status::Ok; + } + + // Expand the list of files_. + uint64_t new_begin_segment = std::min(files->begin_segment, segment); + uint64_t new_end_segment = std::max(files->end_segment, segment + 1); + void* buffer = std::malloc(bundle_t::size(new_end_segment - new_begin_segment)); + bundle_t* new_files = new(buffer) bundle_t{ handler_, new_begin_segment, new_end_segment, + *files }; + files_.store(new_files); + // Delete the old list only after all threads have finished looking at it. + Context context{ files }; + IAsyncContext* context_copy; + Status result = context.DeepCopy(context_copy); + assert(result == Status::Ok); + epoch_->BumpCurrentEpoch(callback, context_copy); + return Status::Ok; + } + + void TruncateSegments(uint64_t new_begin_segment, GcState::truncate_callback_t caller_callback) { + class Context : public IAsyncContext { + public: + Context(bundle_t* files_, uint64_t new_begin_segment_, + GcState::truncate_callback_t caller_callback_) + : files{ files_ } + , new_begin_segment{ new_begin_segment_ } + , caller_callback{ caller_callback_ } { + } + /// The deep-copy constructor. + Context(const Context& other) + : files{ other.files } + , new_begin_segment{ other.new_begin_segment } + , caller_callback{ other.caller_callback } { + } + protected: + Status DeepCopy_Internal(IAsyncContext*& context_copy) final { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + public: + bundle_t* files; + uint64_t new_begin_segment; + GcState::truncate_callback_t caller_callback; + }; + + auto callback = [](IAsyncContext* ctxt) { + CallbackContext context{ ctxt }; + for(uint64_t idx = context->files->begin_segment; idx < context->new_begin_segment; ++idx) { + file_t& file = context->files->file(idx); + file.Close(); + file.Delete(); + } + std::free(context->files); + if(context->caller_callback) { + context->caller_callback(context->new_begin_segment * kSegmentSize); + } + }; + + // Only one thread can modify the list of files at a given time. + std::lock_guard lock{ mutex_ }; + bundle_t* files = files_.load(); + assert(files); + if(files->begin_segment >= new_begin_segment) { + // Segments have already been truncated. + if(caller_callback) { + caller_callback(files->begin_segment * kSegmentSize); + } + return; + } + + // Make a copy of the list, excluding the files to be truncated. 
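+    // Illustrative note on the reclamation pattern used here (and in OpenSegment() above):
+    // readers use files_ without taking mutex_, so the old bundle cannot be freed inline.
+    // Instead the new list is published first and the old one is handed to the epoch manager,
+    // which frees it only after every thread has moved past the current epoch:
+    //
+    //   files_.store(new_files);                           // publish the new list
+    //   epoch_->BumpCurrentEpoch(callback, context_copy);  // old list freed later, safely
+    //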
+ void* buffer = std::malloc(bundle_t::size(files->end_segment - new_begin_segment)); + bundle_t* new_files = new(buffer) bundle_t{ handler_, new_begin_segment, files->end_segment, + *files }; + files_.store(new_files); + // Delete the old list only after all threads have finished looking at it. + Context context{ files, new_begin_segment, caller_callback }; + IAsyncContext* context_copy; + Status result = context.DeepCopy(context_copy); + assert(result == Status::Ok); + epoch_->BumpCurrentEpoch(callback, context_copy); + } + + std::atomic begin_segment_; + std::atomic files_; + handler_t* handler_; + std::string filename_; + environment::FileOptions file_options_; + LightEpoch* epoch_; + std::mutex mutex_; +}; + +template +class FileSystemDisk { + public: + typedef H handler_t; + typedef FileSystemFile file_t; + typedef FileSystemSegmentedFile log_file_t; + + private: + static std::string NormalizePath(std::string root_path) { + if(root_path.empty() || root_path.back() != FASTER::environment::kPathSeparator[0]) { + root_path += FASTER::environment::kPathSeparator; + } + return root_path; + } + + public: + FileSystemDisk(const std::string& root_path, LightEpoch& epoch, bool enablePrivileges = false, + bool unbuffered = true, bool delete_on_close = false) + : root_path_{ NormalizePath(root_path) } + , handler_{ 16 /*max threads*/ } + , default_file_options_{ unbuffered, delete_on_close } + , log_{ root_path_ + "log.log", default_file_options_, &epoch} { + Status result = log_.Open(&handler_); + assert(result == Status::Ok); + } + + /// Methods required by the (implicit) disk interface. + uint32_t sector_size() const { + return static_cast(log_.alignment()); + } + + const log_file_t& log() const { + return log_; + } + log_file_t& log() { + return log_; + } + + std::string relative_index_checkpoint_path(uint32_t version) const { + std::string retval = "index-checkpoints"; + retval += FASTER::environment::kPathSeparator; + retval += std::to_string(version); + retval += FASTER::environment::kPathSeparator; + return retval; + } + std::string index_checkpoint_path(uint32_t version) const { + return root_path_ + relative_index_checkpoint_path(version); + } + + std::string relative_cpr_checkpoint_path(uint32_t version) const { + std::string retval = "cpr-checkpoints"; + retval += FASTER::environment::kPathSeparator; + retval += std::to_string(version); + retval += FASTER::environment::kPathSeparator; + return retval; + } + std::string cpr_checkpoint_path(uint32_t version) const { + return root_path_ + relative_cpr_checkpoint_path(version); + } + + void CreateIndexCheckpointDirectory(uint32_t version) { + std::string index_dir = index_checkpoint_path(version); + std::experimental::filesystem::path path{ index_dir }; + try { + std::experimental::filesystem::remove_all(path); + } catch(std::experimental::filesystem::filesystem_error&) { + // Ignore; throws when path doesn't exist yet. + } + std::experimental::filesystem::create_directories(path); + } + + void CreateCprCheckpointDirectory(uint32_t version) { + std::string cpr_dir = cpr_checkpoint_path(version); + std::experimental::filesystem::path path{ cpr_dir }; + try { + std::experimental::filesystem::remove_all(path); + } catch(std::experimental::filesystem::filesystem_error&) { + // Ignore; throws when path doesn't exist yet. 
+ } + std::experimental::filesystem::create_directories(path); + } + + file_t NewFile(const std::string& relative_path) { + return file_t{ root_path_ + relative_path, default_file_options_ }; + } + + /// Implementation-specific accessor. + handler_t& handler() { + return handler_; + } + + bool TryComplete() { + return handler_.TryComplete(); + } + + private: + std::string root_path_; + handler_t handler_; + + environment::FileOptions default_file_options_; + + /// Store the log (contains all records). + log_file_t log_; +}; + +} +} // namespace FASTER::device diff --git a/cc/src/device/null_disk.h b/cc/src/device/null_disk.h new file mode 100644 index 000000000..0034fa290 --- /dev/null +++ b/cc/src/device/null_disk.h @@ -0,0 +1,124 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include + +#include "../core/gc_state.h" +#include "../core/light_epoch.h" +#include "../environment/file.h" + +namespace FASTER { +namespace device { + +/// A dummy (null) disk, used when you want an in-memory-only FASTER store. + +struct NullHandler { +}; + +class NullFile { + public: + Status Open(NullHandler* handler) { + return Status::Ok; + } + Status Close() { + return Status::Ok; + } + Status Delete() { + return Status::Ok; + } + void Truncate(uint64_t new_begin_offset, GcState::truncate_callback_t callback) { + if(callback) { + callback(new_begin_offset); + } + } + + Status ReadAsync(uint64_t source, void* dest, uint32_t length, + AsyncIOCallback callback, IAsyncContext& context) const { + callback(&context, Status::Ok, length); + return Status::Ok; + } + Status WriteAsync(const void* source, uint64_t dest, uint32_t length, + AsyncIOCallback callback, IAsyncContext& context) { + callback(&context, Status::Ok, length); + return Status::Ok; + } + + static size_t alignment() { + // Align null device to cache line. + return 64; + } + + void set_handler(NullHandler* handler) { + } +}; + +class NullDisk { + public: + typedef NullHandler handler_t; + typedef NullFile file_t; + typedef NullFile log_file_t; + + NullDisk(const std::string& filename, LightEpoch& epoch) { + } + + static uint32_t sector_size() { + return 64; + } + + /// Methods required by the (implicit) disk interface. + const file_t& log() const { + return log_; + } + file_t& log() { + return log_; + } + + std::string relative_index_checkpoint_path(uint32_t version) const { + assert(false); + return ""; + } + std::string index_checkpoint_path(uint32_t version) const { + assert(false); + return ""; + } + + std::string relative_cpr_checkpoint_path(uint32_t version) const { + assert(false); + return ""; + } + std::string cpr_checkpoint_path(uint32_t version) const { + assert(false); + return ""; + } + + void CreateIndexCheckpointDirectory(uint32_t version) { + assert(false); + } + void CreateCprCheckpointDirectory(uint32_t version) { + assert(false); + } + + file_t NewFile(const std::string& relative_path) { + assert(false); + return file_t{}; + } + + handler_t& handler() { + return handler_; + } + + inline static constexpr bool TryComplete() { + return false; + } + + private: + handler_t handler_; + file_t log_; +}; + +} +} // namespace FASTER::device \ No newline at end of file diff --git a/cc/src/environment/file.h b/cc/src/environment/file.h new file mode 100644 index 000000000..34cac012c --- /dev/null +++ b/cc/src/environment/file.h @@ -0,0 +1,10 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
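+// Illustrative note: the two platform headers expose the same surface (a File type plus an I/O
+// handler -- QueueIoHandler on Linux, ThreadPoolIoHandler on Windows), so device code such as
+// FileSystemDisk can be instantiated against either one. A sketch of the usual selection
+// (assuming FileSystemDisk's template parameters are the handler type and the segment size,
+// mirroring FileSystemSegmentedFile):
+//
+//   #ifdef _WIN32
+//   typedef FASTER::environment::ThreadPoolIoHandler handler_t;
+//   #else
+//   typedef FASTER::environment::QueueIoHandler handler_t;
+//   #endif
+//   typedef FASTER::device::FileSystemDisk<handler_t, 1073741824ULL /*1 GB segments*/> disk_t;
+//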
+ +#pragma once + +#ifdef _WIN32 +#include "file_windows.h" +#else +#include "file_linux.h" +#endif diff --git a/cc/src/environment/file_common.h b/cc/src/environment/file_common.h new file mode 100644 index 000000000..af8c8fa3b --- /dev/null +++ b/cc/src/environment/file_common.h @@ -0,0 +1,60 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include + +#include "../core/async.h" +#include "../core/lss_allocator.h" + +using namespace FASTER::core; + +namespace FASTER { +namespace environment { + +enum class FileCreateDisposition : uint8_t { + /// Creates the file if it does not exist; truncates it if it does. + CreateOrTruncate, + /// Opens the file if it exists; creates it if it does not. + OpenOrCreate, + /// Opens the file if it exists. + OpenExisting +}; + +inline std::ostream& operator<<(std::ostream& os, FileCreateDisposition val) { + switch(val) { + case FileCreateDisposition::CreateOrTruncate: + os << "CreateOrTruncate"; + break; + case FileCreateDisposition::OpenOrCreate: + os << "OpenOrCreate"; + break; + case FileCreateDisposition::OpenExisting: + os << "OpenExisting"; + break; + default: + os << "UNKNOWN: " << static_cast(val); + break; + } + return os; +} + +enum class FileOperationType : uint8_t { Read, Write }; + +struct FileOptions { + FileOptions() + : unbuffered{ false } + , delete_on_close{ false } { + } + FileOptions(bool unbuffered_, bool delete_on_close_) + : unbuffered{ unbuffered_ } + , delete_on_close{ delete_on_close_ } { + } + + bool unbuffered; + bool delete_on_close; +}; + +} +} // namespace FASTER::environment \ No newline at end of file diff --git a/cc/src/environment/file_linux.cc b/cc/src/environment/file_linux.cc new file mode 100644 index 000000000..2cbcf7b5c --- /dev/null +++ b/cc/src/environment/file_linux.cc @@ -0,0 +1,199 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include +#include +#include +#include +#include +#include +#include +#include +#include "file_linux.h" + +namespace FASTER { +namespace environment { + +#ifdef _DEBUG +#define DCHECK_ALIGNMENT(o, l, b) \ +do { \ + assert(reinterpret_cast(b) % device_alignment() == 0); \ + assert((o) % device_alignment() == 0); \ + assert((l) % device_alignment() == 0); \ +} while (0) +#else +#define DCHECK_ALIGNMENT(o, l, b) do {} while(0) +#endif + +Status File::Open(int flags, FileCreateDisposition create_disposition, bool* exists) { + if(exists) { + *exists = false; + } + + int create_flags = GetCreateDisposition(create_disposition); + + /// Always unbuffered (O_DIRECT). + fd_ = ::open(filename_.c_str(), flags | O_RDWR | create_flags, S_IRUSR | S_IWUSR); + + if(exists) { + // Let the caller know whether the file we tried to open or create (already) exists. + if(create_disposition == FileCreateDisposition::CreateOrTruncate || + create_disposition == FileCreateDisposition::OpenOrCreate) { + *exists = (errno == EEXIST); + } else if(create_disposition == FileCreateDisposition::OpenExisting) { + *exists = (errno != ENOENT); + if(!*exists) { + // The file doesn't exist. Don't return an error, since the caller is expecting this case. 
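+      // Illustrative caller-side sketch of this protocol (QueueFile is the async file type
+      // declared in file_linux.h; handler and options are the caller's):
+      //
+      //   bool exists = false;
+      //   Status s = file.Open(FileCreateDisposition::OpenExisting, options, &handler, &exists);
+      //   if(s == Status::Ok && !exists) {
+      //     // Not an I/O error: the file simply was not there, and was not opened.
+      //   }
+      //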
+ return Status::Ok; + } + } + } + if(fd_ == -1) { + int error = errno; + return Status::IOError; + } + + Status result = GetDeviceAlignment(); + if(result != Status::Ok) { + Close(); + } + owner_ = true; + return result; +} + +Status File::Close() { + if(fd_ != -1) { + int result = ::close(fd_); + fd_ = -1; + if(result == -1) { + int error = errno; + return Status::IOError; + } + } + owner_ = false; + return Status::Ok; +} + +Status File::Delete() { + int result = ::remove(filename_.c_str()); + if(result == -1) { + int error = errno; + return Status::IOError; + } +} + +Status File::GetDeviceAlignment() { + // For now, just hardcode 512-byte alignment. + device_alignment_ = 512; + return Status::Ok; +} + +int File::GetCreateDisposition(FileCreateDisposition create_disposition) { + switch(create_disposition) { + case FileCreateDisposition::CreateOrTruncate: + return O_CREAT | O_TRUNC; + case FileCreateDisposition::OpenOrCreate: + return O_CREAT; + case FileCreateDisposition::OpenExisting: + return 0; + default: + assert(false); + return 0; // not reached + } +} + +void QueueIoHandler::IoCompletionCallback(io_context_t ctx, struct iocb* iocb, long res, + long res2) { + auto callback_context = make_context_unique_ptr( + reinterpret_cast(iocb)); + size_t bytes_transferred; + Status return_status; + if(res < 0) { + return_status = Status::IOError; + bytes_transferred = 0; + } else { + return_status = Status::Ok; + bytes_transferred = res; + } + callback_context->callback(callback_context->caller_context, return_status, bytes_transferred); +} + +bool QueueIoHandler::TryComplete() { + struct timespec timeout; + std::memset(&timeout, 0, sizeof(timeout)); + struct io_event events[1]; + int result = ::io_getevents(io_object_, 1, 1, events, &timeout); + if(result == 1) { + io_callback_t callback = reinterpret_cast(events[0].data); + callback(io_object_, events[0].obj, events[0].res, events[0].res2); + return true; + } else { + return false; + } +} + +Status QueueFile::Open(FileCreateDisposition create_disposition, const FileOptions& options, + QueueIoHandler* handler, bool* exists) { + int flags = 0; + if(options.unbuffered) { + flags |= O_DIRECT; + } + RETURN_NOT_OK(File::Open(flags, create_disposition, exists)); + if(exists && !*exists) { + return Status::Ok; + } + + io_object_ = handler->io_object(); + return Status::Ok; +} + +Status QueueFile::Read(size_t offset, uint32_t length, uint8_t* buffer, + IAsyncContext& context, AsyncIOCallback callback) const { + DCHECK_ALIGNMENT(offset, length, buffer); +#ifdef IO_STATISTICS + ++read_count_; + bytes_read_ += length; +#endif + return const_cast(this)->ScheduleOperation(FileOperationType::Read, buffer, + offset, length, context, callback); +} + +Status QueueFile::Write(size_t offset, uint32_t length, const uint8_t* buffer, + IAsyncContext& context, AsyncIOCallback callback) { + DCHECK_ALIGNMENT(offset, length, buffer); +#ifdef IO_STATISTICS + bytes_written_ += length; +#endif + return ScheduleOperation(FileOperationType::Write, const_cast(buffer), offset, length, + context, callback); +} + +Status QueueFile::ScheduleOperation(FileOperationType operationType, uint8_t* buffer, + size_t offset, uint32_t length, IAsyncContext& context, + AsyncIOCallback callback) { + auto io_context = alloc_context(sizeof( + QueueIoHandler::IoCallbackContext)); + if(!io_context.get()) return Status::OutOfMemory; + + IAsyncContext* caller_context_copy; + RETURN_NOT_OK(context.DeepCopy(caller_context_copy)); + + new(io_context.get()) 
QueueIoHandler::IoCallbackContext(operationType, fd_, offset, length, + buffer, caller_context_copy, callback); + + struct iocb* iocbs[1]; + iocbs[0] = reinterpret_cast(io_context.get()); + + int result = ::io_submit(io_object_, 1, iocbs); + if(result != 1) { + return Status::IOError; + } + + io_context.release(); + return Status::Ok; +} + +#undef DCHECK_ALIGNMENT + +} +} // namespace FASTER::environment diff --git a/cc/src/environment/file_linux.h b/cc/src/environment/file_linux.h new file mode 100644 index 000000000..500576e42 --- /dev/null +++ b/cc/src/environment/file_linux.h @@ -0,0 +1,254 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "../core/async.h" +#include "../core/status.h" +#include "file_common.h" + +namespace FASTER { +namespace environment { + +constexpr const char* kPathSeparator = "/"; + +/// The File class encapsulates the OS file handle. +class File { + protected: + File() + : fd_{ -1 } + , device_alignment_{ 0 } + , filename_{} + , owner_{ false } +#ifdef IO_STATISTICS + , bytes_written_ { 0 } + , read_count_{ 0 } + , bytes_read_{ 0 } +#endif + { + } + + File(const std::string& filename) + : fd_{ -1 } + , device_alignment_{ 0 } + , filename_{ filename } + , owner_{ false } +#ifdef IO_STATISTICS + , bytes_written_ { 0 } + , read_count_{ 0 } + , bytes_read_{ 0 } +#endif + { + } + + /// Move constructor. + File(File&& other) + : fd_{ other.fd_ } + , device_alignment_{ other.device_alignment_ } + , filename_{ std::move(other.filename_) } + , owner_{ other.owner_ } +#ifdef IO_STATISTICS + , bytes_written_ { other.bytes_written_ } + , read_count_{ other.read_count_ } + , bytes_read_{ other.bytes_read_ } +#endif + { + other.owner_ = false; + } + + ~File() { + if(owner_) { + Status s = Close(); + } + } + + /// Move assignment operator. + File& operator=(File&& other) { + fd_ = other.fd_; + device_alignment_ = other.device_alignment_; + filename_ = std::move(other.filename_); + owner_ = other.owner_; +#ifdef IO_STATISTICS + bytes_written_ = other.bytes_written_; + read_count_ = other.read_count_; + bytes_read_ = other.bytes_read_; +#endif + other.owner_ = -1; + return *this; + } + + protected: + Status Open(int flags, FileCreateDisposition create_disposition, bool* exists = nullptr); + + public: + Status Close(); + Status Delete(); + + uint64_t size() const { + struct stat stat_buffer; + int result = ::fstat(fd_, &stat_buffer); + return (result == 0) ? stat_buffer.st_size : 0; + } + + size_t device_alignment() const { + return device_alignment_; + } + + const std::string& filename() const { + return filename_; + } + +#ifdef IO_STATISTICS + uint64_t bytes_written() const { + return bytes_written_.load(); + } + uint64_t read_count() const { + return read_count_.load(); + } + uint64_t bytes_read() const { + return bytes_read_.load(); + } +#endif + + private: + Status GetDeviceAlignment(); + static int GetCreateDisposition(FileCreateDisposition create_disposition); + + protected: + int fd_; + + private: + size_t device_alignment_; + std::string filename_; + bool owner_; + +#ifdef IO_STATISTICS + protected: + std::atomic bytes_written_; + std::atomic read_count_; + std::atomic bytes_read_; +#endif +}; + +class QueueFile; + +/// The QueueIoHandler class encapsulates completions for async file I/O, where the completions +/// are put on the AIO completion queue. 
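+/// Illustrative sketch: completions are not delivered on a background thread; whoever owns the
+/// handler polls for them (FASTER does so via the disk's TryComplete()), e.g.
+///
+///   QueueIoHandler handler{ 16 /*max threads*/ };
+///   // ... issue QueueFile::Read()/Write() calls on files opened against this handler ...
+///   while(handler.TryComplete()) {
+///     // each successful call drained one completed I/O and ran its callback
+///   }
+///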
+class QueueIoHandler { + public: + typedef QueueFile async_file_t; + + private: + constexpr static int kMaxEvents = 128; + + public: + QueueIoHandler() + : io_object_{ 0 } { + } + QueueIoHandler(size_t max_threads) + : io_object_{ 0 } { + int result = ::io_setup(kMaxEvents, &io_object_); + assert(result >= 0); + } + + /// Move constructor + QueueIoHandler(QueueIoHandler&& other) { + io_object_ = other.io_object_; + other.io_object_ = 0; + } + + ~QueueIoHandler() { + if(io_object_ != 0) + ::io_destroy(io_object_); + } + + /// Invoked whenever a Linux AIO completes. + static void IoCompletionCallback(io_context_t ctx, struct iocb* iocb, long res, long res2); + + struct IoCallbackContext { + IoCallbackContext(FileOperationType operation, int fd, size_t offset, uint32_t length, + uint8_t* buffer, IAsyncContext* context_, AsyncIOCallback callback_) + : caller_context{ context_ } + , callback{ callback_ } { + if(FileOperationType::Read == operation) { + ::io_prep_pread(&this->parent_iocb, fd, buffer, length, offset); + } else { + ::io_prep_pwrite(&this->parent_iocb, fd, buffer, length, offset); + } + ::io_set_callback(&this->parent_iocb, IoCompletionCallback); + } + + // WARNING: "parent_iocb" must be the first field in AioCallbackContext. This class is a C-style + // subclass of "struct iocb". + + /// The iocb structure for Linux AIO. + struct iocb parent_iocb; + + /// Caller callback context. + IAsyncContext* caller_context; + + /// The caller's asynchronous callback function + AsyncIOCallback callback; + }; + + inline io_context_t io_object() const { + return io_object_; + } + + /// Try to execute the next IO completion on the queue, if any. + bool TryComplete(); + + private: + /// The Linux AIO context used for IO completions. + io_context_t io_object_; +}; + +/// The QueueFile class encapsulates asynchronous reads and writes, using the specified AIO +/// context. +class QueueFile : public File { + public: + QueueFile() + : File() + , io_object_{ nullptr } { + } + QueueFile(const std::string& filename) + : File(filename) + , io_object_{ nullptr } { + } + /// Move constructor + QueueFile(QueueFile&& other) + : File(std::move(other)) + , io_object_{ other.io_object_ } { + } + /// Move assignment operator. + QueueFile& operator=(QueueFile&& other) { + File::operator=(std::move(other)); + io_object_ = other.io_object_; + return *this; + } + + Status Open(FileCreateDisposition create_disposition, const FileOptions& options, + QueueIoHandler* handler, bool* exists = nullptr); + + Status Read(size_t offset, uint32_t length, uint8_t* buffer, + IAsyncContext& context, AsyncIOCallback callback) const; + Status Write(size_t offset, uint32_t length, const uint8_t* buffer, + IAsyncContext& context, AsyncIOCallback callback); + + private: + Status ScheduleOperation(FileOperationType operationType, uint8_t* buffer, size_t offset, + uint32_t length, IAsyncContext& context, AsyncIOCallback callback); + + io_context_t io_object_; +}; + +} +} // namespace FASTER::environment diff --git a/cc/src/environment/file_windows.cc b/cc/src/environment/file_windows.cc new file mode 100644 index 000000000..a0dd2f975 --- /dev/null +++ b/cc/src/environment/file_windows.cc @@ -0,0 +1,372 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
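A note on the intended call pattern for the Linux wrappers above: QueueIoHandler owns a kernel AIO context (io_setup/io_destroy), QueueFile::Read/Write pack each request into an IoCallbackContext whose leading field is the iocb (hence the C-style-subclass warning), and completions are delivered only when the caller polls TryComplete(). The following is a minimal sketch, not part of this patch; it assumes FileOptions is default-constructible, that cc/src is on the include path, and that AsyncIOCallback matches the way IoCompletionCallback invokes it.

// Minimal usage sketch (illustrative only, not part of this commit).
#include <atomic>
#include <cstdint>
#include <cstdlib>
#include "environment/file_linux.h"

using namespace FASTER::core;
using namespace FASTER::environment;

// Caller-side context; the explicit interface requires DeepCopy_Internal().
class SketchContext : public IAsyncContext {
 protected:
  Status DeepCopy_Internal(IAsyncContext*& context_copy) {
    return IAsyncContext::DeepCopy_Internal(*this, context_copy);
  }
};

static std::atomic<bool> io_done{ false };

int main() {
  QueueIoHandler handler{ 16 };    // io_setup() with kMaxEvents completion slots
  QueueFile file{ "sketch.dat" };

  bool exists = false;
  FileOptions options;             // assumed default-constructible (buffered I/O)
  if(file.Open(FileCreateDisposition::OpenOrCreate, options, &handler, &exists) != Status::Ok) {
    return 1;
  }

  // Offset, length, and buffer must satisfy DCHECK_ALIGNMENT when unbuffered (O_DIRECT).
  uint8_t* buffer = static_cast<uint8_t*>(std::aligned_alloc(512, 4096));
  SketchContext context;
  auto callback = [](IAsyncContext* ctxt, Status result, size_t bytes_transferred) {
    // ctxt is the deep-copied context; freeing it is omitted here for brevity.
    io_done.store(true);
  };

  // For a freshly created empty file this completes with zero bytes transferred.
  Status s = file.Read(0, 4096, buffer, context, callback);

  // Completions are not pushed from a background thread; the caller polls the AIO queue.
  while(s == Status::Ok && !io_done.load()) {
    handler.TryComplete();
  }

  std::free(buffer);
  file.Close();
  return 0;
}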
+ +#include +#include +#include +#include "file.h" + +using namespace FASTER::core; + +namespace FASTER { +namespace environment { +std::string FormatWin32AndHRESULT(DWORD win32_result) { + std::stringstream ss; + ss << "Win32(" << win32_result << ") HRESULT(" + << std::showbase << std::uppercase << std::setfill('0') << std::hex + << HRESULT_FROM_WIN32(win32_result) << ")"; + return ss.str(); +} + +#ifdef _DEBUG +#define DCHECK_ALIGNMENT(o, l, b) \ +do { \ + assert(reinterpret_cast(b) % device_alignment() == 0); \ + assert((o) % device_alignment() == 0); \ + assert((l) % device_alignment() == 0); \ +} while (0) +#else +#define DCHECK_ALIGNMENT(o, l, b) do {} while(0) +#endif + +Status File::Open(DWORD flags, FileCreateDisposition create_disposition, bool* exists) { + assert(!filename_.empty()); + if(exists) { + *exists = false; + } + + file_handle_ = ::CreateFileA(filename_.c_str(), GENERIC_READ | GENERIC_WRITE, 0, nullptr, + GetCreateDisposition(create_disposition), flags, nullptr); + if(exists) { + // Let the caller know whether the file we tried to open or create (already) exists. + if(create_disposition == FileCreateDisposition::CreateOrTruncate || + create_disposition == FileCreateDisposition::OpenOrCreate) { + *exists = (::GetLastError() == ERROR_ALREADY_EXISTS); + } else if(create_disposition == FileCreateDisposition::OpenExisting) { + *exists = (::GetLastError() != ERROR_FILE_NOT_FOUND); + if(!*exists) { + // The file doesn't exist. Don't return an error, since the caller is expecting this case. + return Status::Ok; + } + } + } + if(file_handle_ == INVALID_HANDLE_VALUE) { + auto error = ::GetLastError(); + return Status::IOError; + } + + Status result = GetDeviceAlignment(); + if(result != Status::Ok) { + Close(); + } + owner_ = true; + return result; +} + +Status File::Close() { + if(file_handle_ != INVALID_HANDLE_VALUE) { + bool success = ::CloseHandle(file_handle_); + file_handle_ = INVALID_HANDLE_VALUE; + if(!success) { + auto error = ::GetLastError(); + return Status::IOError; + } + } + owner_ = false; + return Status::Ok; +} + +Status File::Delete() { + bool success = ::DeleteFileA(filename_.c_str()); + if(!success) { + auto error = ::GetLastError(); + return Status::IOError; + } + return Status::Ok; +} + +Status File::GetDeviceAlignment() { + FILE_STORAGE_INFO info; + bool result = ::GetFileInformationByHandleEx(file_handle_, + FILE_INFO_BY_HANDLE_CLASS::FileStorageInfo, &info, sizeof(info)); + if(!result) { + auto error = ::GetLastError(); + return Status::IOError; + } + + device_alignment_ = info.LogicalBytesPerSector; + return Status::Ok; +} + +DWORD File::GetCreateDisposition(FileCreateDisposition create_disposition) { + switch(create_disposition) { + case FileCreateDisposition::CreateOrTruncate: + return CREATE_ALWAYS; + case FileCreateDisposition::OpenOrCreate: + return OPEN_ALWAYS; + case FileCreateDisposition::OpenExisting: + return OPEN_EXISTING; + default: + assert(false); + return INVALID_FILE_ATTRIBUTES; // not reached + } +} + +void CALLBACK ThreadPoolIoHandler::IoCompletionCallback(PTP_CALLBACK_INSTANCE instance, + PVOID context, PVOID overlapped, ULONG ioResult, ULONG_PTR bytesTransferred, PTP_IO io) { + // context is always nullptr; state is threaded via the OVERLAPPED + auto callback_context = make_context_unique_ptr( + reinterpret_cast(overlapped)); + + HRESULT hr = HRESULT_FROM_WIN32(ioResult); + Status return_status; + if(FAILED(hr)) { + return_status = Status::IOError; + } else { + return_status = Status::Ok; + } + 
callback_context->callback(callback_context->caller_context, return_status, + static_cast(bytesTransferred)); +} + +WindowsPtpThreadPool::WindowsPtpThreadPool(size_t max_threads) + : pool_{ nullptr } + , callback_environment_{ nullptr } + , cleanup_group_{ nullptr } + , max_threads_{ max_threads } { + pool_ = ::CreateThreadpool(nullptr); + ::SetThreadpoolThreadMaximum(pool_, static_cast(max_threads)); + bool ret = ::SetThreadpoolThreadMinimum(pool_, 1); + if(!ret) { + throw std::runtime_error{ "Cannot set threadpool thread minimum to 1" }; + } + cleanup_group_ = ::CreateThreadpoolCleanupGroup(); + if(!cleanup_group_) { + throw std::runtime_error{ "Cannot create threadpool cleanup group" }; + } + + callback_environment_ = new TP_CALLBACK_ENVIRON{}; + + ::InitializeThreadpoolEnvironment(callback_environment_); + ::SetThreadpoolCallbackPool(callback_environment_, pool_); + ::SetThreadpoolCallbackPriority(callback_environment_, TP_CALLBACK_PRIORITY_LOW); + ::SetThreadpoolCallbackCleanupGroup(callback_environment_, cleanup_group_, nullptr); +} + +WindowsPtpThreadPool::~WindowsPtpThreadPool() { + if(!cleanup_group_) return; + + // Wait until all callbacks have finished. + ::CloseThreadpoolCleanupGroupMembers(cleanup_group_, FALSE, nullptr); + + ::DestroyThreadpoolEnvironment(callback_environment_); + + ::CloseThreadpoolCleanupGroup(cleanup_group_); + ::CloseThreadpool(pool_); + + delete callback_environment_; +} + +Status WindowsPtpThreadPool::Schedule(Task task, void* task_parameters) { + auto info = alloc_context(sizeof(TaskInfo)); + if(!info.get()) return Status::OutOfMemory; + new(info.get()) TaskInfo(); + + info->task = task; + info->task_parameters = task_parameters; + + PTP_WORK_CALLBACK ptp_callback = TaskStartSpringboard; + PTP_WORK work = CreateThreadpoolWork(ptp_callback, info.get(), callback_environment_); + if(!work) { + std::stringstream ss; + ss << "Failed to schedule work: " << FormatWin32AndHRESULT(::GetLastError()); + fprintf(stderr, "%s\n", ss.str().c_str()); + return Status::Aborted; + } + SubmitThreadpoolWork(work); + info.release(); + + return Status::Ok; +} + +void CALLBACK WindowsPtpThreadPool::TaskStartSpringboard(PTP_CALLBACK_INSTANCE instance, + PVOID parameter, PTP_WORK work) { + auto info = make_context_unique_ptr(reinterpret_cast(parameter)); + info->task(info->task_parameters); + CloseThreadpoolWork(work); +} + +Status ThreadPoolFile::Open(FileCreateDisposition create_disposition, const FileOptions& options, + ThreadPoolIoHandler* handler, bool* exists) { + DWORD flags = FILE_FLAG_RANDOM_ACCESS | FILE_FLAG_OVERLAPPED; + if(options.unbuffered) { + flags |= FILE_FLAG_NO_BUFFERING; + } + RETURN_NOT_OK(File::Open(flags, create_disposition, exists)); + if(exists && !*exists) { + return Status::Ok; + } + + io_object_ = ::CreateThreadpoolIo(file_handle_, handler->IoCompletionCallback, nullptr, + handler->callback_environment()); + if(!io_object_) { + Close(); + return Status::IOError; + } + return Status::Ok; +} + +Status ThreadPoolFile::Read(size_t offset, uint32_t length, uint8_t* buffer, + IAsyncContext& context, AsyncIOCallback callback) const { + DCHECK_ALIGNMENT(offset, length, buffer); +#ifdef IO_STATISTICS + ++read_count_; + bytes_read_ += length; +#endif + return const_cast(this)->ScheduleOperation(FileOperationType::Read, buffer, + offset, length, context, callback); +} + +Status ThreadPoolFile::Write(size_t offset, uint32_t length, const uint8_t* buffer, + IAsyncContext& context, AsyncIOCallback callback) { + DCHECK_ALIGNMENT(offset, length, buffer); +#ifdef 
IO_STATISTICS + bytes_written_ += length; +#endif + return ScheduleOperation(FileOperationType::Write, const_cast(buffer), offset, length, + context, callback); +} + +Status ThreadPoolFile::ScheduleOperation(FileOperationType operationType, uint8_t* buffer, + size_t offset, uint32_t length, IAsyncContext& context, AsyncIOCallback callback) { + auto io_context = alloc_context(sizeof( + ThreadPoolIoHandler::IoCallbackContext)); + if(!io_context.get()) return Status::OutOfMemory; + + IAsyncContext* caller_context_copy; + RETURN_NOT_OK(context.DeepCopy(caller_context_copy)); + + new(io_context.get()) ThreadPoolIoHandler::IoCallbackContext(offset, caller_context_copy, + callback); + + ::StartThreadpoolIo(io_object_); + + bool success = FALSE; + if(FileOperationType::Read == operationType) { + success = ::ReadFile(file_handle_, buffer, length, nullptr, &io_context->parent_overlapped); + } else { + success = ::WriteFile(file_handle_, buffer, length, nullptr, &io_context->parent_overlapped); + } + if(!success) { + DWORD win32_result = ::GetLastError(); + // Any error other than ERROR_IO_PENDING means the IO failed. Otherwise it will finish + // asynchronously on the threadpool + if(ERROR_IO_PENDING != win32_result) { + ::CancelThreadpoolIo(io_object_); + std::stringstream ss; + ss << "Failed to schedule async IO: " << FormatWin32AndHRESULT(win32_result); + fprintf(stderr, "%s\n", ss.str().c_str()); + return Status::IOError; + } + } + io_context.release(); + return Status::Ok; +} + +bool QueueIoHandler::TryComplete() { + DWORD bytes_transferred; + ULONG_PTR completion_key; + LPOVERLAPPED overlapped = NULL; + bool succeeded = ::GetQueuedCompletionStatus(io_completion_port_, &bytes_transferred, + &completion_key, &overlapped, 0); + if(overlapped) { + Status return_status; + if(!succeeded) { + return_status = Status::IOError; + } else { + return_status = Status::Ok; + } + auto callback_context = make_context_unique_ptr( + reinterpret_cast(overlapped)); + callback_context->callback(callback_context->caller_context, return_status, bytes_transferred); + return true; + } else { + return false; + } +} + +Status QueueFile::Open(FileCreateDisposition create_disposition, const FileOptions& options, + QueueIoHandler* handler, bool* exists) { + DWORD flags = FILE_FLAG_RANDOM_ACCESS | FILE_FLAG_OVERLAPPED; + if(options.unbuffered) { + flags |= FILE_FLAG_NO_BUFFERING; + } + RETURN_NOT_OK(File::Open(flags, create_disposition, exists)); + if(exists && !*exists) { + return Status::Ok; + } + + handler->AssociateFile(file_handle_); + return Status::Ok; +} + +Status QueueFile::Read(size_t offset, uint32_t length, uint8_t* buffer, + IAsyncContext& context, AsyncIOCallback callback) const { + DCHECK_ALIGNMENT(offset, length, buffer); +#ifdef IO_STATISTICS + ++read_count_; + bytes_read_ += length; +#endif + return const_cast(this)->ScheduleOperation(FileOperationType::Read, buffer, + offset, length, context, callback); +} + +Status QueueFile::Write(size_t offset, uint32_t length, const uint8_t* buffer, + IAsyncContext& context, AsyncIOCallback callback) { + DCHECK_ALIGNMENT(offset, length, buffer); +#ifdef IO_STATISTICS + bytes_written_ += length; +#endif + return ScheduleOperation(FileOperationType::Write, const_cast(buffer), offset, length, + context, callback); +} + +Status QueueFile::ScheduleOperation(FileOperationType operationType, uint8_t* buffer, + size_t offset, uint32_t length, IAsyncContext& context, + AsyncIOCallback callback) { + auto io_context = alloc_context(sizeof( + QueueIoHandler::IoCallbackContext)); + 
if(!io_context.get()) return Status::OutOfMemory; + + IAsyncContext* caller_context_copy; + RETURN_NOT_OK(context.DeepCopy(caller_context_copy)); + + new(io_context.get()) QueueIoHandler::IoCallbackContext(offset, caller_context_copy, + callback); + + bool success = FALSE; + if(FileOperationType::Read == operationType) { + success = ::ReadFile(file_handle_, buffer, length, nullptr, &io_context->parent_overlapped); + } else { + success = ::WriteFile(file_handle_, buffer, length, nullptr, &io_context->parent_overlapped); + } + if(!success) { + DWORD win32_result = ::GetLastError(); + // Any error other than ERROR_IO_PENDING means the IO failed. Otherwise it will finish + // asynchronously on the threadpool + if(ERROR_IO_PENDING != win32_result) { + std::stringstream ss; + ss << "Failed to schedule async IO: " << FormatWin32AndHRESULT(win32_result) << + ", handle " << std::to_string((uint64_t)file_handle_); + fprintf(stderr, "%s\n", ss.str().c_str()); + return Status::IOError; + } + } + io_context.release(); + return Status::Ok; +} + +#undef DCHECK_ALIGNMENT + +} +} // namespace FASTER::environment \ No newline at end of file diff --git a/cc/src/environment/file_windows.h b/cc/src/environment/file_windows.h new file mode 100644 index 000000000..aa1aa5c8e --- /dev/null +++ b/cc/src/environment/file_windows.h @@ -0,0 +1,415 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#ifdef _WIN32 +#define NOMINMAX +#define _WINSOCKAPI_ +#include +#endif + +#include +#include +#include + +#include "../core/async.h" +#include "../core/status.h" +#include "file_common.h" + +/// Windows file routines. + +namespace FASTER { +namespace environment { +constexpr const char* kPathSeparator = "\\"; + +/// The File class encapsulates the OS file handle. +class File { + protected: + File() + : file_handle_{ INVALID_HANDLE_VALUE } + , device_alignment_{ 0 } + , filename_{} + , owner_{ false } +#ifdef IO_STATISTICS + , bytes_written_ { 0 } + , read_count_{ 0 } + , bytes_read_{ 0 } +#endif + { + } + + File(const std::string& filename) + : file_handle_{ INVALID_HANDLE_VALUE } + , device_alignment_{ 0 } + , filename_{ filename } + , owner_{ false } +#ifdef IO_STATISTICS + , bytes_written_ { 0 } + , read_count_{ 0 } + , bytes_read_{ 0 } +#endif + { + } + + /// Move constructor. + File(File&& other) + : file_handle_{ other.file_handle_ } + , device_alignment_{ other.device_alignment_ } + , filename_{ std::move(other.filename_) } + , owner_{ other.owner_ } +#ifdef IO_STATISTICS + , bytes_written_ { other.bytes_written_ } + , read_count_{ other.read_count_ } + , bytes_read_{ other.bytes_read_ } +#endif + { + other.owner_ = false; + } + + ~File() { + if(owner_) { + Status s = Close(); + } + } + + /// Move assignment operator. + File& operator=(File&& other) { + file_handle_ = other.file_handle_; + device_alignment_ = other.device_alignment_; + filename_ = std::move(other.filename_); + owner_ = other.owner_; +#ifdef IO_STATISTICS + bytes_written_ = other.bytes_written_; + read_count_ = other.read_count_; + bytes_read_ = other.bytes_read_; +#endif + other.owner_ = false; + return *this; + } + + protected: + Status Open(DWORD flags, FileCreateDisposition create_disposition, bool* exists = nullptr); + + public: + Status Close(); + Status Delete(); + + uint64_t size() const { + LARGE_INTEGER file_size; + auto result = ::GetFileSizeEx(file_handle_, &file_size); + return result ? 
file_size.QuadPart : 0; + } + + size_t device_alignment() const { + return device_alignment_; + } + + const std::string& filename() const { + return filename_; + } + +#ifdef IO_STATISTICS + uint64_t bytes_written() const { + return bytes_written_.load(); + } + uint64_t read_count() const { + return read_count_.load(); + } + uint64_t bytes_read() const { + return bytes_read_.load(); + } +#endif + + private: + Status GetDeviceAlignment(); + static DWORD GetCreateDisposition(FileCreateDisposition create_disposition); + + protected: + HANDLE file_handle_; + + private: + size_t device_alignment_; + std::string filename_; + bool owner_; + +#ifdef IO_STATISTICS + protected: + std::atomic bytes_written_; + std::atomic read_count_; + std::atomic bytes_read_; +#endif +}; + +class WindowsPtpThreadPool { + public: + typedef void(*Task)(void* arguments); + + WindowsPtpThreadPool() + : pool_{ nullptr } + , callback_environment_{ nullptr } + , cleanup_group_{ nullptr } + , max_threads_{ 0 } { + } + + WindowsPtpThreadPool(size_t max_threads); + + /// Move constructor + WindowsPtpThreadPool(WindowsPtpThreadPool&& other) + : pool_{ other.pool_ } + , callback_environment_{ other.callback_environment_ } + , cleanup_group_{ other.cleanup_group_ } + , max_threads_{ other.max_threads_ } { + other.pool_ = nullptr; + other.callback_environment_ = nullptr; + other.cleanup_group_ = nullptr; + other.max_threads_ = 0; + } + + ~WindowsPtpThreadPool(); + + Status Schedule(Task task, void* task_argument); + + PTP_CALLBACK_ENVIRON callback_environment() { + return callback_environment_; + } + + private: + /// Describes a task that should be invoked. Created and enqueued in ScheduleTask(); dispatched + /// and freed in TaskStartSpringboard(). + struct TaskInfo { + TaskInfo() + : task{} + , task_parameters{} { + } + + /// The task to be invoked when the work item is issued by the pool. + Task task; + + /// Argument passed into #m_task when it is called. + void* task_parameters; + }; + + /// Called asynchronously by a thread from #m_pool whenever the thread pool starts to execute a + /// task scheduled via ScheduleTask(). Just determines which routine was requested for execution + /// and calls it. + static void CALLBACK TaskStartSpringboard(PTP_CALLBACK_INSTANCE instance, PVOID parameter, + PTP_WORK work); + + /// A Window Thread Pool object that is used to run asynchronous IO + /// operations (and callbacks) and other tasks (scheduled via + /// ScheduleTask()). + PTP_POOL pool_; + + /// An environment that associates Windows Thread Pool IO and Task objects + /// to #m_pool. AsyncIOFileWrappers and scheduled tasks are associated + /// with this environments to schedule them for execution. + PTP_CALLBACK_ENVIRON callback_environment_; + + /// The cleanup group associated with all environments and the thread pool. + PTP_CLEANUP_GROUP cleanup_group_; + + /// Maximum number of threads the thread pool should allocate. + uint64_t max_threads_; +}; + +class ThreadPoolFile; +class QueueFile; + +/// The ThreadPoolIoHandler class encapsulates completions for async file I/O, scheduled on a +/// thread pool. +class ThreadPoolIoHandler { + public: + typedef ThreadPoolFile async_file_t; + + ThreadPoolIoHandler() + : threadpool_{} { + } + + ThreadPoolIoHandler(size_t max_threads) + : threadpool_{ max_threads } { + } + + /// Move constructor. 
+ ThreadPoolIoHandler(ThreadPoolIoHandler&& other) + : threadpool_{ std::move(other.threadpool_) } { + } + + /// Invoked whenever an asynchronous IO completes; needed because Windows asynchronous IOs are + /// tied to a specific TP_IO object. As a result, we allocate pointers for a per-operation + /// callback along with its OVERLAPPED structure. This allows us to call a specific function in + /// response to each IO, without having to create a TP_IO for each of them. + static void CALLBACK IoCompletionCallback(PTP_CALLBACK_INSTANCE instance, PVOID context, + PVOID overlapped, ULONG ioResult, ULONG_PTR bytesTransferred, PTP_IO io); + + PTP_CALLBACK_ENVIRON callback_environment() { + return threadpool_.callback_environment(); + } + + struct IoCallbackContext { + IoCallbackContext(size_t offset, IAsyncContext* context_, AsyncIOCallback callback_) + : caller_context{ context_ } + , callback{ callback_ } { + ::memset(&parent_overlapped, 0, sizeof(parent_overlapped)); + parent_overlapped.Offset = offset & 0xffffffffllu; + parent_overlapped.OffsetHigh = offset >> 32; + } + + // WARNING: parent_overlapped must be the first field in IOCallbackContext. This class is a + // C-style subclass of "OVERLAPPED". + + /// The overlapped structure for Windows IO + OVERLAPPED parent_overlapped; + /// Caller callback context. + IAsyncContext* caller_context; + /// The caller's asynchronous callback function + AsyncIOCallback callback; + }; + + inline static constexpr bool TryComplete() { + return false; + } + + private: + /// The parent threadpool. + WindowsPtpThreadPool threadpool_; +}; + +/// The QueueIoHandler class encapsulates completions for async file I/O, where the completions +/// are put on a completion port's queue. +class QueueIoHandler { + public: + typedef QueueFile async_file_t; + + QueueIoHandler() + : io_completion_port_{ INVALID_HANDLE_VALUE } { + } + QueueIoHandler(size_t max_threads) + : io_completion_port_{ 0 } { + io_completion_port_ = ::CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, + (DWORD)Thread::kMaxNumThreads); + } + + /// Move constructor + QueueIoHandler(QueueIoHandler&& other) + : io_completion_port_{ other.io_completion_port_ } { + other.io_completion_port_ = INVALID_HANDLE_VALUE; + } + + ~QueueIoHandler() { + if(io_completion_port_ != INVALID_HANDLE_VALUE) { + ::CloseHandle(io_completion_port_); + } + } + + inline void AssociateFile(HANDLE file_handle) { + assert(io_completion_port_ != 0); + ::CreateIoCompletionPort(file_handle, io_completion_port_, + reinterpret_cast(file_handle), 0); + } + + struct IoCallbackContext { + IoCallbackContext(size_t offset, IAsyncContext* context_, AsyncIOCallback callback_) + : caller_context{ context_ } + , callback{ callback_ } { + ::memset(&parent_overlapped, 0, sizeof(parent_overlapped)); + parent_overlapped.Offset = offset & 0xffffffffllu; + parent_overlapped.OffsetHigh = offset >> 32; + } + + // WARNING: parent_overlapped must be the first field in IOCallbackContext. This class is a + // C-style subclass of "OVERLAPPED". + + /// The overlapped structure for Windows IO + OVERLAPPED parent_overlapped; + /// Caller callback context. + IAsyncContext* caller_context; + /// The caller's asynchronous callback function + AsyncIOCallback callback; + }; + + bool TryComplete(); + + private: + /// The completion port to whose queue completions are added. + HANDLE io_completion_port_; +}; + +/// The ThreadPoolFile class encapsulates asynchronous reads and writes, where the OS schedules the +/// IO completion on a thread pool. 
+class ThreadPoolFile : public File { + public: + ThreadPoolFile() + : File() + , io_object_{ nullptr } { + } + + ThreadPoolFile(const std::string& filename) + : File(filename) + , io_object_{ nullptr } { + } + + /// Move constructor + ThreadPoolFile(ThreadPoolFile&& other) + : File(std::move(other)) + , io_object_{ other.io_object_} { + } + + /// Move assignment operator. + ThreadPoolFile& operator=(ThreadPoolFile&& other) { + File::operator=(std::move(other)); + io_object_ = other.io_object_; + return *this; + } + + Status Open(FileCreateDisposition create_disposition, const FileOptions& options, + ThreadPoolIoHandler* handler, bool* exists = nullptr); + + Status Read(size_t offset, uint32_t length, uint8_t* buffer, + IAsyncContext& context, AsyncIOCallback callback) const; + Status Write(size_t offset, uint32_t length, const uint8_t* buffer, + IAsyncContext& context, AsyncIOCallback callback); + + private: + Status ScheduleOperation(FileOperationType operationType, uint8_t* buffer, size_t offset, + uint32_t length, IAsyncContext& context, AsyncIOCallback callback); + + PTP_IO io_object_; +}; + +/// The QueueFile class encapsulates asynchronous reads and writes, where the IO completions are +/// placed on the completion port's queue. +class QueueFile : public File { + public: + QueueFile() + : File() { + } + QueueFile(const std::string& filename) + : File(filename) { + } + /// Move constructor + QueueFile(QueueFile&& other) + : File(std::move(other)) { + } + + /// Move assignment operator. + QueueFile& operator=(QueueFile&& other) { + File::operator=(std::move(other)); + return *this; + } + + Status Open(FileCreateDisposition create_disposition, const FileOptions& options, + QueueIoHandler* handler, bool* exists = nullptr); + + Status Read(size_t offset, uint32_t length, uint8_t* buffer, + IAsyncContext& context, AsyncIOCallback callback) const; + Status Write(size_t offset, uint32_t length, const uint8_t* buffer, + IAsyncContext& context, AsyncIOCallback callback); + + private: + Status ScheduleOperation(FileOperationType operationType, uint8_t* buffer, size_t offset, + uint32_t length, IAsyncContext& context, AsyncIOCallback callback); +}; + +} +} // namespace FASTER::environment \ No newline at end of file diff --git a/cc/test/CMakeLists.txt b/cc/test/CMakeLists.txt new file mode 100644 index 000000000..0b28e4a30 --- /dev/null +++ b/cc/test/CMakeLists.txt @@ -0,0 +1,11 @@ +ADD_FAST_TEST(in_memory_test "") +ADD_FAST_TEST(malloc_fixed_page_size_test "") +ADD_FAST_TEST(paging_queue_test "paging_test.h") +if(MSVC) +ADD_FAST_TEST(paging_threadpool_test "paging_test.h") +endif() +ADD_FAST_TEST(recovery_queue_test "recovery_test.h") +if(MSVC) +ADD_FAST_TEST(recovery_threadpool_test "recovery_test.h") +endif() +ADD_FAST_TEST(utility_test "") diff --git a/cc/test/in_memory_test.cc b/cc/test/in_memory_test.cc new file mode 100644 index 000000000..85d6547f3 --- /dev/null +++ b/cc/test/in_memory_test.cc @@ -0,0 +1,1912 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
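Before the tests: the CMake list above builds paging_threadpool_test and recovery_threadpool_test only under MSVC because ThreadPoolIoHandler and WindowsPtpThreadPool are declared only in file_windows.h, whereas the *_queue_test variants use QueueIoHandler, which exists on both platforms (an I/O completion port on Windows, libaio on Linux) and is drained by polling TryComplete(). The sketch below shows the kind of platform selection a caller (or the file.h shim included by file_windows.cc, presumably) would perform; the handler_t alias and the #ifdef are illustrative, not taken verbatim from the patch.

// Illustrative platform selection for the async I/O handler.
#ifdef _WIN32
#include "environment/file_windows.h"
// Completions are dispatched by the Windows thread pool; TryComplete() is a no-op.
typedef FASTER::environment::ThreadPoolIoHandler handler_t;
// Alternative on Windows: QueueIoHandler, which queues completions on an IOCP
// and requires the application to call TryComplete() periodically.
#else
#include "environment/file_linux.h"
// Linux AIO; completions are reaped by calling TryComplete() on the handler.
typedef FASTER::environment::QueueIoHandler handler_t;
#endif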
+ +#include +#include +#include +#include +#include +#include "gtest/gtest.h" + +#include "core/faster.h" +#include "device/null_disk.h" + +using namespace FASTER::core; +TEST(InMemFaster, UpsertRead) { + class alignas(2) Key { + public: + Key(uint8_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn; + return KeyHash{ hash_fn(key_) }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint8_t key_; + }; + + class UpsertContext; + class ReadContext; + + class Value { + public: + Value() + : value_{ 0 } { + } + Value(const Value& other) + : value_{ other.value_ } { + } + Value(uint8_t value) + : value_{ value } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class UpsertContext; + friend class ReadContext; + + private: + union { + uint8_t value_; + std::atomic atomic_value_; + }; + }; + + class UpsertContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + UpsertContext(uint8_t key) + : key_{ key } { + } + + /// Copy (and deep-copy) constructor. + UpsertContext(const UpsertContext& other) + : key_{ other.key_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void Put(Value& value) { + value.value_ = 23; + } + inline bool PutAtomic(Value& value) { + value.atomic_value_.store(42); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + }; + + class ReadContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext(uint8_t key) + : key_{ key } { + } + + /// Copy (and deep-copy) constructor. + ReadContext(const ReadContext& other) + : key_{ other.key_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + // All reads should be atomic (from the mutable tail). + ASSERT_TRUE(false); + } + inline void GetAtomic(const Value& value) { + output = value.atomic_value_.load(); + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + public: + uint8_t output; + }; + + FasterKv store { 128, 1073741824, "" }; + + store.StartSession(); + + // Insert. + for(size_t idx = 0; idx < 256; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + UpsertContext context{ static_cast(idx) }; + Status result = store.Upsert(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + } + // Read. + for(size_t idx = 0; idx < 256; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. 
+ ASSERT_TRUE(false); + }; + ReadContext context{ static_cast(idx) }; + Status result = store.Read(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + // All upserts should have inserts (non-atomic). + ASSERT_EQ(23, context.output); + } + // Update. + for(size_t idx = 0; idx < 256; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + UpsertContext context{ static_cast(idx) }; + Status result = store.Upsert(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + } + // Read again. + for(size_t idx = 0; idx < 256; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + ReadContext context{ static_cast(idx) }; + Status result = store.Read(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + // All upserts should have updates (atomic). + ASSERT_EQ(42, context.output); + } + + store.StopSession(); +} + +/// The hash always returns "0," so the FASTER store devolves into a linked list. +TEST(InMemFaster, UpsertRead_DummyHash) { + class UpsertContext; + class ReadContext; + + class Key { + public: + Key(uint16_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + return KeyHash{ 42 }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + friend class UpsertContext; + friend class ReadContext; + + private: + uint16_t key_; + }; + + class Value { + public: + Value() + : value_{ 0 } { + } + Value(const Value& other) + : value_{ other.value_ } { + } + Value(uint16_t value) + : value_{ value } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class UpsertContext; + friend class ReadContext; + + private: + union { + uint16_t value_; + std::atomic atomic_value_; + }; + }; + + class UpsertContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + UpsertContext(uint16_t key) + : key_{ key } { + } + + /// Copy (and deep-copy) constructor. + UpsertContext(const UpsertContext& other) + : key_{ other.key_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void Put(Value& value) { + value.value_ = key_.key_; + } + inline bool PutAtomic(Value& value) { + value.atomic_value_.store(key_.key_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + }; + + class ReadContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext(uint16_t key) + : key_{ key } { + } + + /// Copy (and deep-copy) constructor. + ReadContext(const ReadContext& other) + : key_{ other.key_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + // All reads should be atomic (from the mutable tail). 
+ ASSERT_TRUE(false); + } + inline void GetAtomic(const Value& value) { + output = value.atomic_value_.load(); + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + public: + uint16_t output; + }; + + FasterKv store{ 128, 1073741824, "" }; + + store.StartSession(); + + // Insert. + for(uint16_t idx = 0; idx < 10000; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + UpsertContext context{ idx }; + Status result = store.Upsert(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + } + // Read. + for(uint16_t idx = 0; idx < 10000; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + ReadContext context{ idx }; + Status result = store.Read(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + // All upserts should have inserts (non-atomic). + ASSERT_EQ(idx, context.output); + } + + store.StopSession(); +} + +TEST(InMemFaster, UpsertRead_Concurrent) { + class Key { + public: + Key(uint32_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn; + return KeyHash{ hash_fn(key_) }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint32_t key_; + }; + + class UpsertContext; + class ReadContext; + + class alignas(16) Value { + public: + Value() + : length_{ 0 } + , value_{ 0 } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class UpsertContext; + friend class ReadContext; + + private: + uint8_t value_[31]; + std::atomic length_; + }; + + class UpsertContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + UpsertContext(uint32_t key) + : key_{ key } { + } + + /// Copy (and deep-copy) constructor. + UpsertContext(const UpsertContext& other) + : key_{ other.key_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void Put(Value& value) { + value.length_ = 5; + std::memset(value.value_, 23, 5); + } + inline bool PutAtomic(Value& value) { + // Get the lock on the value. + bool success; + do { + uint8_t expected_length; + do { + // Spin until other the thread releases the lock. + expected_length = value.length_.load(); + } while(expected_length == UINT8_MAX); + // Try to get the lock. + success = value.length_.compare_exchange_weak(expected_length, UINT8_MAX); + } while(!success); + + std::memset(value.value_, 42, 7); + value.length_.store(7); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + }; + + class ReadContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext(uint32_t key) + : key_{ key } { + } + + /// Copy (and deep-copy) constructor. 
+ ReadContext(const ReadContext& other) + : key_{ other.key_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + // All reads should be atomic (from the mutable tail). + ASSERT_TRUE(false); + } + inline void GetAtomic(const Value& value) { + do { + output_length = value.length_.load(); + ASSERT_EQ(0, reinterpret_cast(value.value_) % 16); + output_pt1 = *reinterpret_cast(value.value_); + output_pt2 = *reinterpret_cast(value.value_ + 8); + } while(output_length != value.length_.load()); + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + public: + uint8_t output_length; + uint64_t output_pt1; + uint64_t output_pt2; + }; + + static constexpr size_t kNumOps = 1024; + static constexpr size_t kNumThreads = 8; + + auto upsert_worker = [](FasterKv* store_, + size_t thread_idx) { + store_->StartSession(); + + for(size_t idx = 0; idx < kNumOps; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + UpsertContext context{ static_cast((thread_idx * kNumOps) + idx) }; + Status result = store_->Upsert(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + } + + store_->StopSession(); + }; + + auto read_worker = [](FasterKv* store_, + size_t thread_idx, uint64_t expected_value) { + store_->StartSession(); + + for(size_t idx = 0; idx < kNumOps; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + ReadContext context{ static_cast((thread_idx * kNumOps) + idx) }; + Status result = store_->Read(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + ASSERT_EQ(expected_value, context.output_pt1); + } + + store_->StopSession(); + }; + + FasterKv store{ 128, 1073741824, "" }; + + // Insert. + std::deque threads{}; + for(size_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(upsert_worker, &store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + // Read. + threads.clear(); + for(size_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(read_worker, &store, idx, 0x1717171717); + } + for(auto& thread : threads) { + thread.join(); + } + + // Update. + threads.clear(); + for(size_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(upsert_worker, &store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + // Read again. + threads.clear(); + for(size_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(read_worker, &store, idx, 0x2a2a2a2a2a2a2a); + } + for(auto& thread : threads) { + thread.join(); + } +} + +TEST(InMemFaster, UpsertRead_ResizeValue_Concurrent) { + class Key { + public: + Key(uint32_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn; + return KeyHash{ hash_fn(key_) }; + } + + /// Comparison operators. 
+ inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint32_t key_; + }; + + class UpsertContext; + class ReadContext; + + class GenLock { + public: + GenLock() + : control_{ 0 } { + } + GenLock(uint64_t control) + : control_{ control } { + } + inline GenLock& operator=(const GenLock& other) { + control_ = other.control_; + return *this; + } + + union { + struct { + uint64_t gen_number : 62; + uint64_t locked : 1; + uint64_t replaced : 1; + }; + uint64_t control_; + }; + }; + static_assert(sizeof(GenLock) == 8, "sizeof(GenLock) != 8"); + + class AtomicGenLock { + public: + AtomicGenLock() + : control_{ 0 } { + } + AtomicGenLock(uint64_t control) + : control_{ control } { + } + + inline GenLock load() const { + return GenLock{ control_.load() }; + } + inline void store(GenLock desired) { + control_.store(desired.control_); + } + + inline bool try_lock(bool& replaced) { + replaced = false; + GenLock expected{ control_.load() }; + expected.locked = 0; + expected.replaced = 0; + GenLock desired{ expected.control_ }; + desired.locked = 1; + + if(control_.compare_exchange_strong(expected.control_, desired.control_)) { + return true; + } + if(expected.replaced) { + replaced = true; + } + return false; + } + inline void unlock(bool replaced) { + if(replaced) { + // Just turn off "locked" bit and increase gen number. + uint64_t sub_delta = ((uint64_t)1 << 62) - 1; + control_.fetch_sub(sub_delta); + } else { + // Turn off "locked" bit, turn on "replaced" bit, and increase gen number + uint64_t add_delta = ((uint64_t)1 << 63) - ((uint64_t)1 << 62) + 1; + control_.fetch_add(add_delta); + } + } + + private: + std::atomic control_; + }; + static_assert(sizeof(AtomicGenLock) == 8, "sizeof(AtomicGenLock) != 8"); + + class Value { + public: + Value() + : gen_lock_{ 0 } + , size_{ 0 } + , length_{ 0 } { + } + + inline uint32_t size() const { + return size_; + } + + friend class UpsertContext; + friend class ReadContext; + + private: + AtomicGenLock gen_lock_; + uint32_t size_; + uint32_t length_; + + inline const uint8_t* buffer() const { + return reinterpret_cast(this + 1); + } + inline uint8_t* buffer() { + return reinterpret_cast(this + 1); + } + }; + + class UpsertContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + UpsertContext(uint32_t key, uint32_t length) + : key_{ key } + , length_{ length } { + } + + /// Copy (and deep-copy) constructor. + UpsertContext(const UpsertContext& other) + : key_{ other.key_ } + , length_{ other.length_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline uint32_t value_size() const { + return sizeof(Value) + length_; + } + /// Non-atomic and atomic Put() methods. + inline void Put(Value& value) { + value.gen_lock_.store(0); + value.size_ = sizeof(Value) + length_; + value.length_ = length_; + std::memset(value.buffer(), 88, length_); + } + inline bool PutAtomic(Value& value) { + bool replaced; + while(!value.gen_lock_.try_lock(replaced) && !replaced) { + std::this_thread::yield(); + } + if(replaced) { + // Some other thread replaced this record. + return false; + } + if(value.size_ < sizeof(Value) + length_) { + // Current value is too small for in-place update. + value.gen_lock_.unlock(true); + return false; + } + // In-place update overwrites length and buffer, but not size. 
+ value.length_ = length_; + std::memset(value.buffer(), 88, length_); + value.gen_lock_.unlock(false); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t length_; + }; + + class ReadContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext(uint32_t key) + : key_{ key } + , output_length{ 0 } { + } + + /// Copy (and deep-copy) constructor. + ReadContext(const ReadContext& other) + : key_{ other.key_ } + , output_length{ 0 } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + // All reads should be atomic (from the mutable tail). + ASSERT_TRUE(false); + } + inline void GetAtomic(const Value& value) { + GenLock before, after; + do { + before = value.gen_lock_.load(); + output_length = value.length_; + output_bytes[0] = value.buffer()[0]; + output_bytes[1] = value.buffer()[value.length_ - 1]; + after = value.gen_lock_.load(); + } while(before.gen_number != after.gen_number); + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + public: + uint8_t output_length; + // Extract two bytes of output. + uint8_t output_bytes[2]; + }; + + static constexpr size_t kNumOps = 1024; + static constexpr size_t kNumThreads = 8; + + auto upsert_worker = [](FasterKv* store_, + size_t thread_idx, uint32_t value_length) { + store_->StartSession(); + + for(size_t idx = 0; idx < kNumOps; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + UpsertContext context{ static_cast((thread_idx * kNumOps) + idx), value_length }; + Status result = store_->Upsert(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + } + + store_->StopSession(); + }; + + auto read_worker = [](FasterKv* store_, + size_t thread_idx, uint8_t expected_value) { + store_->StartSession(); + + for(size_t idx = 0; idx < kNumOps; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + ReadContext context{ static_cast((thread_idx * kNumOps) + idx) }; + Status result = store_->Read(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + ASSERT_EQ(expected_value, context.output_bytes[0]); + ASSERT_EQ(expected_value, context.output_bytes[1]); + } + + store_->StopSession(); + }; + + FasterKv store{ 128, 1073741824, "" }; + + // Insert. + std::deque threads{}; + for(size_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(upsert_worker, &store, idx, 7); + } + for(auto& thread : threads) { + thread.join(); + } + + // Read. + threads.clear(); + for(size_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(read_worker, &store, idx, 88); + } + for(auto& thread : threads) { + thread.join(); + } + + // Update. + threads.clear(); + for(size_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(upsert_worker, &store, idx, 11); + } + for(auto& thread : threads) { + thread.join(); + } + + // Read again. 
+ threads.clear(); + for(size_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(read_worker, &store, idx, 88); + } + for(auto& thread : threads) { + thread.join(); + } +} +TEST(InMemFaster, Rmw) { + class Key { + public: + Key(uint64_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn; + return KeyHash{ hash_fn(key_) }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint64_t key_; + }; + + class RmwContext; + class ReadContext; + + class Value { + public: + Value() + : value_{ 0 } { + } + Value(const Value& other) + : value_{ other.value_ } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class RmwContext; + friend class ReadContext; + + private: + union { + int32_t value_; + std::atomic atomic_value_; + }; + }; + + class RmwContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + RmwContext(uint64_t key, int32_t incr) + : key_{ key } + , incr_{ incr } { + } + + /// Copy (and deep-copy) constructor. + RmwContext(const RmwContext& other) + : key_{ other.key_ } + , incr_{ other.incr_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + inline void RmwInitial(Value& value) { + value.value_ = incr_; + } + inline void RmwCopy(const Value& old_value, Value& value) { + value.value_ = old_value.value_ + incr_; + } + inline bool RmwAtomic(Value& value) { + value.atomic_value_.fetch_add(incr_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + int32_t incr_; + Key key_; + }; + + class ReadContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext(uint64_t key) + : key_{ key } { + } + + /// Copy (and deep-copy) constructor. + ReadContext(const ReadContext& other) + : key_{ other.key_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + // All reads should be atomic (from the mutable tail). + ASSERT_TRUE(false); + } + inline void GetAtomic(const Value& value) { + output = value.atomic_value_.load(); + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + public: + int32_t output; + }; + + FasterKv store{ 256, 1073741824, "" }; + + store.StartSession(); + + // Rmw, increment by 1. + for(size_t idx = 0; idx < 2048; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + RmwContext context{ idx % 512, 1 }; + Status result = store.Rmw(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + } + // Read. + for(size_t idx = 0; idx < 512; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. 
+ ASSERT_TRUE(false); + }; + ReadContext context{ idx }; + Status result = store.Read(context, callback, 1); + ASSERT_EQ(Status::Ok, result) << idx; + // Should have performed 4 RMWs. + ASSERT_EQ(4, context.output); + } + // Rmw, decrement by 1. + for(size_t idx = 0; idx < 2048; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + RmwContext context{ idx % 512, -1 }; + Status result = store.Rmw(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + } + // Read again. + for(size_t idx = 0; idx < 512; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + ReadContext context{ static_cast(idx) }; + Status result = store.Read(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + // All upserts should have inserts (non-atomic). + ASSERT_EQ(0, context.output); + } + + store.StopSession(); +} + +TEST(InMemFaster, Rmw_Concurrent) { + class Key { + public: + Key(uint64_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn; + return KeyHash{ hash_fn(key_) }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint64_t key_; + }; + + class RmwContext; + class ReadContext; + + class Value { + public: + Value() + : value_{ 0 } { + } + Value(const Value& other) + : value_{ other.value_ } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class RmwContext; + friend class ReadContext; + + private: + union { + int64_t value_; + std::atomic atomic_value_; + }; + }; + + class RmwContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + RmwContext(uint64_t key, int64_t incr) + : key_{ key } + , incr_{ incr } { + } + + /// Copy (and deep-copy) constructor. + RmwContext(const RmwContext& other) + : key_{ other.key_ } + , incr_{ other.incr_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + + inline void RmwInitial(Value& value) { + value.value_ = incr_; + } + inline void RmwCopy(const Value& old_value, Value& value) { + value.value_ = old_value.value_ + incr_; + } + inline bool RmwAtomic(Value& value) { + value.atomic_value_.fetch_add(incr_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + int64_t incr_; + Key key_; + }; + + class ReadContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext(uint64_t key) + : key_{ key } { + } + + /// Copy (and deep-copy) constructor. + ReadContext(const ReadContext& other) + : key_{ other.key_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + // All reads should be atomic (from the mutable tail). 
+ ASSERT_TRUE(false); + } + inline void GetAtomic(const Value& value) { + output = value.atomic_value_.load(); + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + public: + int64_t output; + }; + + static constexpr size_t kNumThreads = 8; + static constexpr size_t kNumRmws = 2048; + static constexpr size_t kRange = 512; + + auto rmw_worker = [](FasterKv* store_, + int64_t incr) { + store_->StartSession(); + + for(size_t idx = 0; idx < kNumRmws; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + RmwContext context{ idx % kRange, incr }; + Status result = store_->Rmw(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + } + + store_->StopSession(); + }; + + FasterKv store{ 256, 1073741824, "" }; + + // Rmw, increment by 2 * idx. + std::deque threads{}; + for(int64_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(rmw_worker, &store, 2 * idx); + } + for(auto& thread : threads) { + thread.join(); + } + + // Read. + store.StartSession(); + + for(size_t idx = 0; idx < kRange; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + ReadContext context{ idx }; + Status result = store.Read(context, callback, 1); + ASSERT_EQ(Status::Ok, result) << idx; + // Should have performed 4 RMWs. + ASSERT_EQ((kNumThreads * (kNumThreads - 1)) * (kNumRmws / kRange), context.output); + } + + store.StopSession(); + + // Rmw, decrement by idx. + threads.clear(); + for(int64_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(rmw_worker, &store, -idx); + } + for(auto& thread : threads) { + thread.join(); + } + + // Read again. + store.StartSession(); + + for(size_t idx = 0; idx < kRange; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + ReadContext context{ static_cast(idx) }; + Status result = store.Read(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + // All upserts should have inserts (non-atomic). + ASSERT_EQ(((kNumThreads * (kNumThreads - 1)) / 2) * (kNumRmws / kRange), context.output); + } + + store.StopSession(); +} + +TEST(InMemFaster, Rmw_ResizeValue_Concurrent) { + class Key { + public: + Key(uint64_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn; + return KeyHash{ hash_fn(key_) }; + } + + /// Comparison operators. 
+ inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint64_t key_; + }; + + class RmwContext; + class ReadContext; + + class GenLock { + public: + GenLock() + : control_{ 0 } { + } + GenLock(uint64_t control) + : control_{ control } { + } + inline GenLock& operator=(const GenLock& other) { + control_ = other.control_; + return *this; + } + + union { + struct { + uint64_t gen_number : 62; + uint64_t locked : 1; + uint64_t replaced : 1; + }; + uint64_t control_; + }; + }; + static_assert(sizeof(GenLock) == 8, "sizeof(GenLock) != 8"); + + class AtomicGenLock { + public: + AtomicGenLock() + : control_{ 0 } { + } + AtomicGenLock(uint64_t control) + : control_{ control } { + } + + inline GenLock load() const { + return GenLock{ control_.load() }; + } + inline void store(GenLock desired) { + control_.store(desired.control_); + } + + inline bool try_lock(bool& replaced) { + replaced = false; + GenLock expected{ control_.load() }; + expected.locked = 0; + expected.replaced = 0; + GenLock desired{ expected.control_ }; + desired.locked = 1; + + if(control_.compare_exchange_strong(expected.control_, desired.control_)) { + return true; + } + if(expected.replaced) { + replaced = true; + } + return false; + } + inline void unlock(bool replaced) { + if(replaced) { + // Just turn off "locked" bit and increase gen number. + uint64_t sub_delta = ((uint64_t)1 << 62) - 1; + control_.fetch_sub(sub_delta); + } else { + // Turn off "locked" bit, turn on "replaced" bit, and increase gen number + uint64_t add_delta = ((uint64_t)1 << 63) - ((uint64_t)1 << 62) + 1; + control_.fetch_add(add_delta); + } + } + + private: + std::atomic control_; + }; + static_assert(sizeof(AtomicGenLock) == 8, "sizeof(AtomicGenLock) != 8"); + + class Value { + public: + Value() + : gen_lock_{ 0 } + , size_{ 0 } + , length_{ 0 } { + } + + inline uint32_t size() const { + return size_; + } + + friend class RmwContext; + friend class ReadContext; + + private: + AtomicGenLock gen_lock_; + uint32_t size_; + uint32_t length_; + + inline const int8_t* buffer() const { + return reinterpret_cast(this + 1); + } + inline int8_t* buffer() { + return reinterpret_cast(this + 1); + } + }; + + class RmwContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + RmwContext(uint64_t key, int8_t incr, uint32_t length) + : key_{ key } + , incr_{ incr } + , length_{ length } { + } + + /// Copy (and deep-copy) constructor. + RmwContext(const RmwContext& other) + : key_{ other.key_ } + , incr_{ other.incr_ } + , length_{ other.length_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. 
+ inline const Key& key() const { + return key_; + } + inline uint32_t value_size() const { + return sizeof(value_t) + length_; + } + + inline void RmwInitial(Value& value) { + value.gen_lock_.store(GenLock{}); + value.size_ = sizeof(Value) + length_; + value.length_ = length_; + std::memset(value.buffer(), incr_, length_); + } + inline void RmwCopy(const Value& old_value, Value& value) { + value.gen_lock_.store(GenLock{}); + value.size_ = sizeof(Value) + length_; + value.length_ = length_; + std::memset(value.buffer(), incr_, length_); + for(uint32_t idx = 0; idx < std::min(old_value.length_, length_); ++idx) { + value.buffer()[idx] = old_value.buffer()[idx] + incr_; + } + } + inline bool RmwAtomic(Value& value) { + bool replaced; + while(!value.gen_lock_.try_lock(replaced) && !replaced) { + std::this_thread::yield(); + } + if(replaced) { + // Some other thread replaced this record. + return false; + } + if(value.size_ < sizeof(Value) + length_) { + // Current value is too small for in-place update. + value.gen_lock_.unlock(true); + return false; + } + // In-place update overwrites length and buffer, but not size. + value.length_ = length_; + for(uint32_t idx = 0; idx < length_; ++idx) { + value.buffer()[idx] += incr_; + } + value.gen_lock_.unlock(false); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + int8_t incr_; + uint32_t length_; + Key key_; + }; + + class ReadContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext(uint64_t key) + : key_{ key } + , output_length{ 0 } { + } + + /// Copy (and deep-copy) constructor. + ReadContext(const ReadContext& other) + : key_{ other.key_ } + , output_length{ 0 } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + // All reads should be atomic (from the mutable tail). + ASSERT_TRUE(false); + } + inline void GetAtomic(const Value& value) { + GenLock before, after; + do { + before = value.gen_lock_.load(); + output_length = value.length_; + output_bytes[0] = value.buffer()[0]; + output_bytes[1] = value.buffer()[value.length_ - 1]; + after = value.gen_lock_.load(); + } while(before.gen_number != after.gen_number); + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + public: + uint8_t output_length; + // Extract two bytes of output. + int8_t output_bytes[2]; + }; + + static constexpr int8_t kNumThreads = 8; + static constexpr size_t kNumRmws = 2048; + static constexpr size_t kRange = 512; + + auto rmw_worker = [](FasterKv* store_, + int8_t incr, uint32_t value_length) { + store_->StartSession(); + + for(size_t idx = 0; idx < kNumRmws; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + RmwContext context{ idx % kRange, incr, value_length }; + Status result = store_->Rmw(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + } + + store_->StopSession(); + }; + + FasterKv store{ 256, 1073741824, "" }; + + // Rmw, increment by 3. 
+ std::deque threads{}; + for(int64_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(rmw_worker, &store, 3, 5); + } + for(auto& thread : threads) { + thread.join(); + } + + // Read. + store.StartSession(); + + for(size_t idx = 0; idx < kRange; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + ReadContext context{ idx }; + Status result = store.Read(context, callback, 1); + ASSERT_EQ(Status::Ok, result) << idx; + // Should have performed 4 RMWs. + ASSERT_EQ(5, context.output_length); + ASSERT_EQ(kNumThreads * 4 * 3, context.output_bytes[0]); + ASSERT_EQ(kNumThreads * 4 * 3, context.output_bytes[1]); + } + + store.StopSession(); + + // Rmw, decrement by 4. + threads.clear(); + for(int64_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(rmw_worker, &store, -4, 8); + } + for(auto& thread : threads) { + thread.join(); + } + + // Read again. + store.StartSession(); + + for(size_t idx = 0; idx < kRange; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + ReadContext context{ static_cast(idx) }; + Status result = store.Read(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + // Should have performed 4 RMWs. + ASSERT_EQ(8, context.output_length); + ASSERT_EQ(kNumThreads * -4, context.output_bytes[0]); + ASSERT_EQ(kNumThreads * -16, context.output_bytes[1]); + } + + store.StopSession(); +} + +TEST(InMemFaster, GrowHashTable) { + class Key { + public: + Key(uint64_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn; + return KeyHash{ hash_fn(key_) }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint64_t key_; + }; + + class RmwContext; + class ReadContext; + + class Value { + public: + Value() + : value_{ 0 } { + } + Value(const Value& other) + : value_{ other.value_ } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class RmwContext; + friend class ReadContext; + + private: + union { + int64_t value_; + std::atomic atomic_value_; + }; + }; + + class RmwContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + RmwContext(uint64_t key, int64_t incr) + : key_{ key } + , incr_{ incr } { + } + + /// Copy (and deep-copy) constructor. + RmwContext(const RmwContext& other) + : key_{ other.key_ } + , incr_{ other.incr_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + + inline void RmwInitial(Value& value) { + value.value_ = incr_; + } + inline void RmwCopy(const Value& old_value, Value& value) { + value.value_ = old_value.value_ + incr_; + } + inline bool RmwAtomic(Value& value) { + value.atomic_value_.fetch_add(incr_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. 
+ Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + int64_t incr_; + Key key_; + }; + + class ReadContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext(uint64_t key) + : key_{ key } { + } + + /// Copy (and deep-copy) constructor. + ReadContext(const ReadContext& other) + : key_{ other.key_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + // All reads should be atomic (from the mutable tail). + ASSERT_TRUE(false); + } + inline void GetAtomic(const Value& value) { + output = value.atomic_value_.load(); + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + public: + int64_t output; + }; + + static constexpr size_t kNumThreads = 8; + static constexpr size_t kNumRmws = 32768; + static constexpr size_t kRange = 8192; + + static std::atomic grow_done{ false }; + + auto rmw_worker0 = [](FasterKv* store_, + int64_t incr) { + auto callback = [](uint64_t new_size) { + grow_done = true; + }; + + store_->StartSession(); + + for(size_t idx = 0; idx < kNumRmws; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + RmwContext context{ idx % kRange, incr }; + Status result = store_->Rmw(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + } + + // Double the size of the index. + store_->GrowIndex(callback); + + while(!grow_done) { + store_->Refresh(); + std::this_thread::yield(); + } + + store_->StopSession(); + }; + + auto rmw_worker = [](FasterKv* store_, + int64_t incr) { + store_->StartSession(); + + for(size_t idx = 0; idx < kNumRmws; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + RmwContext context{ idx % kRange, incr }; + Status result = store_->Rmw(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + } + + while(!grow_done) { + store_->Refresh(); + std::this_thread::yield(); + } + + store_->StopSession(); + }; + + FasterKv store{ 256, 1073741824, "" }; + + // Rmw, increment by 2 * idx. + std::deque threads{}; + threads.emplace_back(rmw_worker0, &store, 0); + for(int64_t idx = 1; idx < kNumThreads; ++idx) { + threads.emplace_back(rmw_worker, &store, 2 * idx); + } + for(auto& thread : threads) { + thread.join(); + } + + // Read. + store.StartSession(); + + for(size_t idx = 0; idx < kRange; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + ReadContext context{ idx }; + Status result = store.Read(context, callback, 1); + ASSERT_EQ(Status::Ok, result) << idx; + // Should have performed 4 RMWs. + ASSERT_EQ((kNumThreads * (kNumThreads - 1)) * (kNumRmws / kRange), context.output); + } + + store.StopSession(); + + // Rmw, decrement by idx. + grow_done = false; + threads.clear(); + threads.emplace_back(rmw_worker0, &store, 0); + for(int64_t idx = 1; idx < kNumThreads; ++idx) { + threads.emplace_back(rmw_worker, &store, -idx); + } + for(auto& thread : threads) { + thread.join(); + } + + // Read again. 
+ store.StartSession(); + + for(size_t idx = 0; idx < kRange; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + ReadContext context{ static_cast(idx) }; + Status result = store.Read(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + // All upserts should have inserts (non-atomic). + ASSERT_EQ(((kNumThreads * (kNumThreads - 1)) / 2) * (kNumRmws / kRange), context.output); + } + + store.StopSession(); +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/cc/test/malloc_fixed_page_size_test.cc b/cc/test/malloc_fixed_page_size_test.cc new file mode 100644 index 000000000..248ba4e83 --- /dev/null +++ b/cc/test/malloc_fixed_page_size_test.cc @@ -0,0 +1,81 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include +#include +#include +#include "gtest/gtest.h" + +#include "core/light_epoch.h" +#include "core/malloc_fixed_page_size.h" +#include "device/null_disk.h" + +struct alignas(32) Item { + uint8_t buffer[32]; +}; + +using namespace FASTER::core; + +typedef MallocFixedPageSize alloc_t; + +TEST(MallocFixedPageSize, AllocFree) { + LightEpoch epoch; + alloc_t allocator{}; + allocator.Initialize(256, epoch); + for(size_t idx = 0; idx < 1000000; ++idx) { + FixedPageAddress address = allocator.Allocate(); + Item* item = &allocator.Get(address); + ASSERT_EQ(0, reinterpret_cast(item) % alignof(Item)); + allocator.FreeAtEpoch(address, 0); + } + ASSERT_EQ(1, allocator.free_list().size()); +} + +TEST(MallocFixedPageSize, Alloc) { + LightEpoch epoch; + alloc_t allocator{}; + allocator.Initialize(128, epoch); + for(size_t idx = 0; idx < 32000000; ++idx) { + FixedPageAddress address = allocator.Allocate(); + Item* item = &allocator.Get(address); + ASSERT_EQ(0, reinterpret_cast(item) % alignof(Item)); + } + ASSERT_EQ(0, allocator.free_list().size()); +} + + +static void MultiThread_Worker(alloc_t* allocator) { + constexpr size_t kAllocCount = 2000000; + FixedPageAddress* addresses = new FixedPageAddress[kAllocCount]; + + for(size_t idx = 0; idx < kAllocCount; ++idx) { + addresses[idx] = allocator->Allocate(); + Item* item = &allocator->Get(addresses[idx]); + ASSERT_EQ(0, reinterpret_cast(item) % alignof(Item)); + } + for(size_t idx = 0; idx < kAllocCount; ++idx) { + allocator->FreeAtEpoch(addresses[idx], idx); + } + ASSERT_EQ(kAllocCount, allocator->free_list().size()); + + delete[] addresses; +} + +TEST(MallocFixedPageSize, Concurrent) { + constexpr size_t kNumThreads = 16; + LightEpoch epoch; + alloc_t allocator{}; + allocator.Initialize(64, epoch); + std::deque threads{}; + for(size_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(MultiThread_Worker, &allocator); + } + for(auto& thread : threads) { + thread.join(); + } +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/cc/test/paging_queue_test.cc b/cc/test/paging_queue_test.cc new file mode 100644 index 000000000..212e0f966 --- /dev/null +++ b/cc/test/paging_queue_test.cc @@ -0,0 +1,27 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
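+
+// This driver and paging_threadpool_test.cc share the test bodies in
+// paging_test.h: each defines CLASS and an I/O handler type before including
+// the header, so the same tests run against both the QueueIoHandler and
+// ThreadPoolIoHandler back ends.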
+ +#include +#include +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "core/faster.h" +#include "device/file_system_disk.h" + +using namespace FASTER::core; + +typedef FASTER::environment::QueueIoHandler handler_t; + +#define CLASS PagingTest_Queue + +#include "paging_test.h" + +#undef CLASS + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/cc/test/paging_test.h b/cc/test/paging_test.h new file mode 100644 index 000000000..da8294d63 --- /dev/null +++ b/cc/test/paging_test.h @@ -0,0 +1,1017 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include + +using namespace FASTER; + +/// Disk's log uses 64 MB segments. +typedef FASTER::device::FileSystemDisk disk_t; + +TEST(CLASS, UpsertRead_Serial) { + class Key { + public: + Key(uint64_t pt1, uint64_t pt2) + : pt1_{ pt1 } + , pt2_{ pt2 } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn; + return KeyHash{ hash_fn(pt1_) }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return pt1_ == other.pt1_ && + pt2_ == other.pt2_; + } + inline bool operator!=(const Key& other) const { + return pt1_ != other.pt1_ || + pt2_ != other.pt2_; + } + + private: + uint64_t pt1_; + uint64_t pt2_; + }; + + class UpsertContext; + class ReadContext; + + class Value { + public: + Value() + : gen_{ 0 } + , value_{ 0 } + , length_{ 0 } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class UpsertContext; + friend class ReadContext; + + private: + std::atomic gen_; + uint8_t value_[1014]; + uint16_t length_; + }; + static_assert(sizeof(Value) == 1024, "sizeof(Value) != 1024"); + static_assert(alignof(Value) == 8, "alignof(Value) != 8"); + + class UpsertContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + UpsertContext(const Key& key, uint8_t val) + : key_{ key } + , val_{ val } { + } + + /// Copy (and deep-copy) constructor. + UpsertContext(const UpsertContext& other) + : key_{ other.key_ } + , val_{ other.val_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void Put(Value& value) { + value.gen_ = 0; + std::memset(value.value_, val_, val_); + value.length_ = val_; + } + inline bool PutAtomic(Value& value) { + // Get the lock on the value. + uint64_t expected_gen; + bool success; + do { + do { + // Spin until other the thread releases the lock. + expected_gen = value.gen_.load(); + } while(expected_gen == UINT64_MAX); + // Try to get the lock. + success = value.gen_.compare_exchange_weak(expected_gen, UINT64_MAX); + } while(!success); + + std::memset(value.value_, val_, val_); + value.length_ = val_; + // Increment the value's generation number. + value.gen_.store(expected_gen + 1); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. 
+ Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint8_t val_; + }; + + class ReadContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext(Key key, uint8_t expected) + : key_{ key } + , expected_{ expected } { + } + + /// Copy (and deep-copy) constructor. + ReadContext(const ReadContext& other) + : key_{ other.key_ } + , expected_{ other.expected_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + // This is a paging test, so we expect to read stuff from disk. + ASSERT_EQ(expected_, value.length_); + ASSERT_EQ(expected_, value.value_[expected_ - 5]); + } + inline void GetAtomic(const Value& value) { + uint64_t post_gen = value.gen_.load(); + uint64_t pre_gen; + uint16_t len; + uint8_t val; + do { + // Pre- gen # for this read is last read's post- gen #. + pre_gen = post_gen; + len = value.length_; + val = value.value_[len - 5]; + post_gen = value.gen_.load(); + } while(pre_gen != post_gen); + ASSERT_EQ(expected_, static_cast(len)); + ASSERT_EQ(expected_, val); + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint8_t expected_; + }; + + std::experimental::filesystem::create_directories("logs"); + + // 8 pages! + FasterKv store{ 262144, 268435456, "logs", 0.5 }; + + Guid session_id = store.StartSession(); + + constexpr size_t kNumRecords = 300000; + + // Insert. + for(size_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // Upserts don't go to disk. + ASSERT_TRUE(false); + }; + + if(idx % 256 == 0) { + store.Refresh(); + } + + UpsertContext context{ Key{idx, idx}, 25 }; + Status result = store.Upsert(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + } + // Read. + static std::atomic records_read{ 0 }; + for(size_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ++records_read; + }; + + if(idx % 256 == 0) { + store.Refresh(); + } + + ReadContext context{ Key{ idx, idx}, 25 }; + Status result = store.Read(context, callback, 1); + if(result == Status::Ok) { + ++records_read; + } else { + ASSERT_EQ(Status::Pending, result); + } + } + + ASSERT_LT(records_read.load(), kNumRecords); + bool result = store.CompletePending(true); + ASSERT_TRUE(result); + ASSERT_EQ(kNumRecords, records_read.load()); + + // Update. + static std::atomic records_updated{ 0 }; + for(size_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // Upserts don't go to disk. + ASSERT_TRUE(false); + }; + + if(idx % 256 == 0) { + store.Refresh(); + } + + UpsertContext context{ Key{ idx, idx }, 87 }; + Status result = store.Upsert(context, callback, 1); + if(result == Status::Ok) { + ++records_updated; + } else { + ASSERT_EQ(Status::Pending, result); + } + } + + ASSERT_EQ(kNumRecords, records_updated.load()); + result = store.CompletePending(true); + ASSERT_TRUE(result); + + // Read again. 
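+ // Even after the update, some reads below return Status::Pending because
+ // parts of the log have already been flushed to disk (hence the ASSERT_LT
+ // before CompletePending); CompletePending(true) drains them and the
+ // ReadContext checks verify the updated value of 87.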
+ records_read = 0;; + for(size_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ++records_read; + }; + + if(idx % 256 == 0) { + store.Refresh(); + } + + ReadContext context{ Key{ idx, idx }, 87 }; + Status result = store.Read(context, callback, 1); + if(result == Status::Ok) { + ++records_read; + } else { + ASSERT_EQ(Status::Pending, result); + } + } + + ASSERT_LT(records_read.load(), kNumRecords); + result = store.CompletePending(true); + ASSERT_TRUE(result); + ASSERT_EQ(kNumRecords, records_read.load()); + + store.StopSession(); +} + +TEST(CLASS, UpsertRead_Concurrent) { + class UpsertContext; + class ReadContext; + + class Key { + public: + Key(uint64_t pt1, uint64_t pt2) + : pt1_{ pt1 } + , pt2_{ pt2 } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn; + return KeyHash{ hash_fn(pt1_) }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return pt1_ == other.pt1_ && + pt2_ == other.pt2_; + } + inline bool operator!=(const Key& other) const { + return pt1_ != other.pt1_ || + pt2_ != other.pt2_; + } + + friend class UpsertContext; + friend class ReadContext; + + private: + uint64_t pt1_; + uint64_t pt2_; + }; + + class Value { + public: + Value() + : gen_{ 0 } + , value_{ 0 } + , length_{ 0 } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class UpsertContext; + friend class ReadContext; + + private: + std::atomic gen_; + uint8_t value_[1014]; + uint16_t length_; + }; + static_assert(sizeof(Value) == 1024, "sizeof(Value) != 1024"); + static_assert(alignof(Value) == 8, "alignof(Value) != 8"); + + class UpsertContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + UpsertContext(const Key& key, uint8_t val) + : key_{ key } + , val_{ val } { + } + + /// Copy (and deep-copy) constructor. + UpsertContext(const UpsertContext& other) + : key_{ other.key_ } + , val_{ other.val_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void Put(Value& value) { + value.gen_ = 0; + std::memset(value.value_, val_, val_); + value.length_ = val_; + } + inline bool PutAtomic(Value& value) { + // Get the lock on the value. + uint64_t expected_gen; + bool success; + do { + do { + // Spin until other the thread releases the lock. + expected_gen = value.gen_.load(); + } while(expected_gen == UINT64_MAX); + // Try to get the lock. + success = value.gen_.compare_exchange_weak(expected_gen, UINT64_MAX); + } while(!success); + + std::memset(value.value_, val_, val_); + value.length_ = val_; + // Increment the value's generation number. + value.gen_.store(expected_gen + 1); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. 
+ Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint8_t val_; + }; + + class ReadContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext(Key key, uint8_t expected) + : key_{ key } + , expected_{ expected } { + } + + /// Copy (and deep-copy) constructor. + ReadContext(const ReadContext& other) + : key_{ other.key_ } + , expected_{ other.expected_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + // This is a paging test, so we expect to read stuff from disk. + ASSERT_EQ(expected_, value.length_); + ASSERT_EQ(expected_, value.value_[expected_ - 5]); + } + inline void GetAtomic(const Value& value) { + uint64_t post_gen = value.gen_.load(); + uint64_t pre_gen; + uint16_t len; + uint8_t val; + do { + // Pre- gen # for this read is last read's post- gen #. + pre_gen = post_gen; + len = value.length_; + val = value.value_[len - 5]; + post_gen = value.gen_.load(); + } while(pre_gen != post_gen); + ASSERT_EQ(expected_, val); + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint8_t expected_; + }; + + std::experimental::filesystem::create_directories("logs"); + + // 8 pages! + FasterKv store{ 262144, 268435456, "logs\\", 0.5 }; + + static constexpr size_t kNumRecords = 300000; + static constexpr size_t kNumThreads = 16; + + static std::atomic num_writes{ 0 }; + + auto upsert_worker = [](FasterKv* store_, + size_t thread_idx, uint8_t val) { + Guid session_id = store_->StartSession(); + + for(size_t idx = 0; idx < kNumRecords / kNumThreads; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + // In-memory test. + ASSERT_TRUE(false); + }; + + if(idx % 256 == 0) { + store_->Refresh(); + } + + uint64_t key_component = thread_idx * (kNumRecords / kNumThreads) + idx; + UpsertContext context{ Key{ key_component, key_component }, val }; + Status result = store_->Upsert(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + ++num_writes; + } + + store_->StopSession(); + }; + + // Insert. + std::deque threads{}; + for(size_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(upsert_worker, &store, idx, 25); + } + for(auto& thread : threads) { + thread.join(); + } + + ASSERT_EQ(kNumRecords, num_writes.load()); + + // Read. + Guid session_id = store.StartSession(); + + static std::atomic records_read{ 0 }; + for(size_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ++records_read; + }; + + if(idx % 256 == 0) { + store.Refresh(); + } + + ReadContext context{ Key{ idx, idx }, 25 }; + Status result = store.Read(context, callback, 1); + if(result == Status::Ok) { + ++records_read; + } else { + ASSERT_EQ(Status::Pending, result) << idx; + } + } + + ASSERT_LT(records_read.load(), kNumRecords); + bool result = store.CompletePending(true); + ASSERT_TRUE(result); + ASSERT_EQ(kNumRecords, records_read.load()); + + //// Update. 
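+ // The second round of upsert workers writes fresh copies (val = 87) at the
+ // log tail; the stale copies below kNewBeginAddress are then discarded via
+ // ShiftBeginAddress before the final read pass.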
+ num_writes = 0; + threads.clear(); + for(size_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(upsert_worker, &store, idx, 87); + } + for(auto& thread : threads) { + thread.join(); + } + + ASSERT_EQ(kNumRecords, num_writes.load()); + + // Delete some old copies of records (160 MB) that we no longer need. + static constexpr uint64_t kNewBeginAddress{ 167772160L }; + static std::atomic truncated{ false }; + static std::atomic complete{ false }; + auto truncate_callback = [](uint64_t offset) { + ASSERT_LE(offset, kNewBeginAddress); + truncated = true; + }; + auto complete_callback = []() { + complete = true; + }; + + result = store.ShiftBeginAddress(Address{ kNewBeginAddress }, truncate_callback, complete_callback); + ASSERT_TRUE(result); + + while(!truncated || !complete) { + store.CompletePending(false); + } + + // Read again. + records_read = 0;; + for(size_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ++records_read; + }; + + if(idx % 256 == 0) { + store.Refresh(); + } + + ReadContext context{ Key{ idx, idx }, 87 }; + Status result = store.Read(context, callback, 1); + if(result == Status::Ok) { + ++records_read; + } else { + ASSERT_EQ(Status::Pending, result); + } + } + + ASSERT_LT(records_read.load(), kNumRecords); + result = store.CompletePending(true); + ASSERT_TRUE(result); + ASSERT_EQ(kNumRecords, records_read.load()); + + store.StopSession(); +} + +TEST(CLASS, Rmw) { + class Key { + public: + Key(uint64_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + return KeyHash{ Utility::GetHashCode(key_) }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint64_t key_; + }; + + class RmwContext; + + class Value { + public: + Value() + : counter_{ 0 } + , junk_{ 1 } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class RmwContext; + + private: + std::atomic counter_; + uint8_t junk_[1016]; + }; + static_assert(sizeof(Value) == 1024, "sizeof(Value) != 1024"); + static_assert(alignof(Value) == 8, "alignof(Value) != 8"); + + class RmwContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + RmwContext(Key key, uint64_t incr) + : key_{ key } + , incr_{ incr } + , val_{ 0 } { + } + + /// Copy (and deep-copy) constructor. + RmwContext(const RmwContext& other) + : key_{ other.key_ } + , incr_{ other.incr_ } + , val_{ other.val_ } { + } + + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + inline void RmwInitial(Value& value) { + value.counter_ = incr_; + val_ = value.counter_; + } + inline void RmwCopy(const Value& old_value, Value& value) { + value.counter_ = old_value.counter_ + incr_; + val_ = value.counter_; + } + inline bool RmwAtomic(Value& value) { + val_ = value.counter_.fetch_add(incr_) + incr_; + return true; + } + + inline uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. 
+ Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint64_t incr_; + + uint64_t val_; + }; + + std::experimental::filesystem::create_directories("logs"); + + // 8 pages! + FasterKv store{ 262144, 268435456, "logs", 0.5 }; + + Guid session_id = store.StartSession(); + + constexpr size_t kNumRecords = 300000; + + // Initial RMW. + static std::atomic records_touched{ 0 }; + for(size_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ASSERT_EQ(3, context->val()); + ++records_touched; + }; + + if(idx % 256 == 0) { + store.Refresh(); + } + + RmwContext context{ Key{ idx }, 3 }; + Status result = store.Rmw(context, callback, 1); + if(result == Status::Ok) { + ASSERT_EQ(3, context.val()); + ++records_touched; + } else { + ASSERT_EQ(Status::Pending, result); + } + } + + bool result = store.CompletePending(true); + ASSERT_TRUE(result); + ASSERT_EQ(kNumRecords, records_touched.load()); + + // Second RMW. + records_touched = 0; + for(size_t idx = kNumRecords; idx > 0; --idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ASSERT_EQ(8, context->val()); + ++records_touched; + }; + + if(idx % 256 == 0) { + store.Refresh(); + } + + RmwContext context{ Key{ idx - 1 }, 5 }; + Status result = store.Rmw(context, callback, 1); + if(result == Status::Ok) { + ASSERT_EQ(8, context.val()) << idx - 1; + ++records_touched; + } else { + ASSERT_EQ(Status::Pending, result); + } + } + + ASSERT_LT(records_touched.load(), kNumRecords); + result = store.CompletePending(true); + ASSERT_TRUE(result); + ASSERT_EQ(kNumRecords, records_touched.load()); + + store.StopSession(); +} + +TEST(CLASS, Rmw_Concurrent) { + class Key { + public: + Key(uint64_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + return KeyHash{ Utility::GetHashCode(key_) }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint64_t key_; + }; + + class RmwContext; + class ReadContext; + + class Value { + public: + Value() + : counter_{ 0 } + , junk_{ 1 } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class RmwContext; + friend class ReadContext; + + private: + std::atomic counter_; + uint8_t junk_[1016]; + }; + static_assert(sizeof(Value) == 1024, "sizeof(Value) != 1024"); + static_assert(alignof(Value) == 8, "alignof(Value) != 8"); + + class RmwContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + RmwContext(Key key, uint64_t incr) + : key_{ key } + , incr_{ incr } { + } + + /// Copy (and deep-copy) constructor. 
+ RmwContext(const RmwContext& other) + : key_{ other.key_ } + , incr_{ other.incr_ } { + } + + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + inline void RmwInitial(Value& value) { + value.counter_ = incr_; + } + inline void RmwCopy(const Value& old_value, Value& value) { + value.counter_ = old_value.counter_ + incr_; + } + inline bool RmwAtomic(Value& value) { + value.counter_.fetch_add(incr_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint64_t incr_; + }; + + class ReadContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext(Key key) + : key_{ key } { + } + + /// Copy (and deep-copy) constructor. + ReadContext(const ReadContext& other) + : key_{ other.key_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + counter = value.counter_.load(std::memory_order_acquire); + } + inline void GetAtomic(const Value& value) { + counter = value.counter_.load(); + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + public: + uint64_t counter; + }; + + static constexpr size_t kNumRecords = 300000; + static constexpr size_t kNumThreads = 8; + + auto rmw_worker = [](FasterKv* store_, uint64_t incr) { + Guid session_id = store_->StartSession(); + for(size_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + }; + + if(idx % 256 == 0) { + store_->Refresh(); + } + + RmwContext context{ Key{ idx }, incr }; + Status result = store_->Rmw(context, callback, 1); + if(result != Status::Ok) { + ASSERT_EQ(Status::Pending, result); + } + } + bool result = store_->CompletePending(true); + ASSERT_TRUE(result); + store_->StopSession(); + }; + + auto read_worker1 = [](FasterKv* store_, size_t thread_idx) { + Guid session_id = store_->StartSession(); + for(size_t idx = 0; idx < kNumRecords / kNumThreads; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ASSERT_EQ(7 * kNumThreads, context->counter); + }; + + if(idx % 256 == 0) { + store_->Refresh(); + } + + ReadContext context{ Key{ thread_idx* (kNumRecords / kNumThreads) + idx } }; + Status result = store_->Read(context, callback, 1); + if(result == Status::Ok) { + ASSERT_EQ(7 * kNumThreads, context.counter); + } else { + ASSERT_EQ(Status::Pending, result); + } + } + bool result = store_->CompletePending(true); + ASSERT_TRUE(result); + store_->StopSession(); + }; + + auto read_worker2 = [](FasterKv* store_, size_t thread_idx) { + Guid session_id = store_->StartSession(); + for(size_t idx = 0; idx < kNumRecords / kNumThreads; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ASSERT_EQ(13 * kNumThreads, context->counter); + }; + + if(idx % 256 == 0) { + store_->Refresh(); + } + + ReadContext context{ Key{ thread_idx* 
(kNumRecords / kNumThreads) + idx } }; + Status result = store_->Read(context, callback, 1); + if(result == Status::Ok) { + ASSERT_EQ(13 * kNumThreads, context.counter); + } else { + ASSERT_EQ(Status::Pending, result); + } + } + bool result = store_->CompletePending(true); + ASSERT_TRUE(result); + store_->StopSession(); + }; + + std::experimental::filesystem::create_directories("logs"); + + // 8 pages! + FasterKv store{ 262144, 268435456, "logs\\", 0.5 }; + + // Initial RMW. + std::deque threads{}; + for(int64_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(rmw_worker, &store, 7); + } + for(auto& thread : threads) { + thread.join(); + } + + // Read. + threads.clear(); + for(int64_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(read_worker1, &store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + // Second RMW. + threads.clear(); + for(int64_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(rmw_worker, &store, 6); + } + for(auto& thread : threads) { + thread.join(); + } + + // Read again. + threads.clear(); + for(int64_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(read_worker2, &store, idx); + } + for(auto& thread : threads) { + thread.join(); + } +} diff --git a/cc/test/paging_threadpool_test.cc b/cc/test/paging_threadpool_test.cc new file mode 100644 index 000000000..79ea2dfe0 --- /dev/null +++ b/cc/test/paging_threadpool_test.cc @@ -0,0 +1,27 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include +#include +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "core/faster.h" +#include "device/file_system_disk.h" + +using namespace FASTER::core; + +typedef FASTER::environment::ThreadPoolIoHandler handler_t; + +#define CLASS PagingTest_ThreadPool + +#include "paging_test.h" + +#undef CLASS + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/cc/test/recovery_queue_test.cc b/cc/test/recovery_queue_test.cc new file mode 100644 index 000000000..e04f3c7e0 --- /dev/null +++ b/cc/test/recovery_queue_test.cc @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include +#include +#include +#include +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "core/faster.h" +#include "core/light_epoch.h" +#include "core/thread.h" +#include "device/file_system_disk.h" + +using namespace FASTER::core; + +typedef FASTER::environment::QueueIoHandler handler_t; + +#define CLASS RecoveryTest_Queue + +#include "recovery_test.h" + +#undef CLASS + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/cc/test/recovery_test.h b/cc/test/recovery_test.h new file mode 100644 index 000000000..e42988fa1 --- /dev/null +++ b/cc/test/recovery_test.h @@ -0,0 +1,3753 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include + +using namespace FASTER; + +/// Disk's log uses 32 MB segments. +typedef FASTER::device::FileSystemDisk disk_t; +typedef FASTER::device::FileSystemFile file_t; + +TEST(CLASS, MallocFixedPageSize) { + typedef MallocFixedPageSize alloc_t; + + // Test copied from C#, RecoveryTest.cs. 
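+// Plan: allocate a run of hash buckets from the fixed-page allocator, fill
+// them with values drawn from a seeded mt19937_64, checkpoint the allocator
+// to disk, recover it into a fresh allocator, then replay the same seed to
+// verify every entry (and that allocation resumes at the next free slot).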
+ std::random_device rd{}; + uint32_t seed = rd(); + std::mt19937_64 rng{ seed }; + std::experimental::filesystem::create_directories("test_ofb"); + + size_t num_bytes_written; + + LightEpoch epoch; + alloc_t allocator{}; + allocator.Initialize(512, epoch); + + size_t num_buckets_to_add = 16 * FixedPage::kPageSize + 5; + + FixedPageAddress* buckets = new FixedPageAddress[num_buckets_to_add]; + + { + disk_t checkpoint_disk{ "test_ofb", epoch }; + file_t checkpoint_file = checkpoint_disk.NewFile("test_ofb.dat"); + Status result = checkpoint_file.Open(&checkpoint_disk.handler()); + ASSERT_EQ(Status::Ok, result); + + //do something + for(size_t bucket_idx = 0; bucket_idx < num_buckets_to_add; ++bucket_idx) { + buckets[bucket_idx] = allocator.Allocate(); + HashBucket& bucket = allocator.Get(buckets[bucket_idx]); + for(size_t entry_idx = 0; entry_idx < HashBucket::kNumEntries; ++entry_idx) { + HashBucketEntry expected{ 0 }; + uint64_t random_num = rng(); + bool success = bucket.entries[entry_idx].compare_exchange_strong(expected, random_num); + ASSERT_TRUE(success); + } + HashBucketOverflowEntry expected{ 0 }; + uint64_t random_num = rng(); + bool success = bucket.overflow_entry.compare_exchange_strong(expected, random_num); + ASSERT_TRUE(success); + } + //issue call to checkpoint + result = allocator.Checkpoint(checkpoint_disk, std::move(checkpoint_file), num_bytes_written); + ASSERT_EQ(Status::Ok, result); + // (All the bucket we allocated, + the null page.) + ASSERT_EQ((num_buckets_to_add + 1) * sizeof(HashBucket), num_bytes_written); + //wait until complete + result = allocator.CheckpointComplete(true); + ASSERT_EQ(Status::Ok, result); + } + + LightEpoch recover_epoch; + alloc_t recover_allocator{}; + recover_allocator.Initialize(512, recover_epoch); + disk_t recover_disk{ "test_ofb", recover_epoch }; + file_t recover_file = recover_disk.NewFile("test_ofb.dat"); + Status result = recover_file.Open(&recover_disk.handler()); + ASSERT_EQ(Status::Ok, result); + + //issue call to recover + result = recover_allocator.Recover(recover_disk, std::move(recover_file), num_bytes_written, + num_bytes_written / sizeof(typename alloc_t::item_t)); + ASSERT_EQ(Status::Ok, result); + //wait until complete + result = recover_allocator.RecoverComplete(true); + ASSERT_EQ(Status::Ok, result); + + //verify that something + std::mt19937_64 rng2{ seed }; + for(size_t bucket_idx = 0; bucket_idx < num_buckets_to_add; ++bucket_idx) { + HashBucket& bucket = allocator.Get(buckets[bucket_idx]); + for(size_t entry_idx = 0; entry_idx < HashBucket::kNumEntries; ++entry_idx) { + uint64_t random_num = rng2(); + ASSERT_EQ(random_num, bucket.entries[entry_idx].load().control_); + } + uint64_t random_num = rng2(); + ASSERT_EQ(random_num, bucket.overflow_entry.load().control_); + } + + FixedPageAddress address = recover_allocator.Allocate(); + ASSERT_EQ(FixedPageAddress{ num_buckets_to_add + 1 }, address); + + delete[] buckets; +} + +TEST(CLASS, InternalHashTable) { + // (Just the hash table itself--no overflow buckets.) 
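+ // Same seeded-RNG round trip as above, but for the in-memory hash table
+ // itself: here the checkpoint should contain exactly
+ // kNumBuckets * sizeof(HashBucket) bytes, with no extra null page.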
+ std::random_device rd{}; + uint32_t seed = rd(); + std::mt19937_64 rng{ seed }; + std::experimental::filesystem::create_directories("test_ht"); + + constexpr uint64_t kNumBuckets = 8388608; + size_t num_bytes_written; + { + LightEpoch epoch; + disk_t checkpoint_disk{ "test_ht", epoch }; + file_t checkpoint_file = checkpoint_disk.NewFile("test_ht.dat"); + Status result = checkpoint_file.Open(&checkpoint_disk.handler()); + ASSERT_EQ(Status::Ok, result); + + InternalHashTable table{}; + table.Initialize(kNumBuckets, checkpoint_file.alignment()); + + //do something + for(size_t bucket_idx = 0; bucket_idx < kNumBuckets; ++bucket_idx) { + for(size_t entry_idx = 0; entry_idx < HashBucket::kNumEntries; ++entry_idx) { + HashBucketEntry expected{ 0 }; + bool success = table.bucket(bucket_idx).entries[entry_idx].compare_exchange_strong( + expected, rng()); + ASSERT_TRUE(success); + } + HashBucketOverflowEntry expected{ 0 }; + bool success = table.bucket(bucket_idx).overflow_entry.compare_exchange_strong(expected, + rng()); + ASSERT_TRUE(success); + } + + //issue call to checkpoint + result = table.Checkpoint(checkpoint_disk, std::move(checkpoint_file), num_bytes_written); + ASSERT_EQ(Status::Ok, result); + // (All the bucket we allocated, + the null page.) + ASSERT_EQ(kNumBuckets * sizeof(HashBucket), num_bytes_written); + //wait until complete + result = table.CheckpointComplete(true); + ASSERT_EQ(Status::Ok, result); + } + + LightEpoch epoch; + disk_t recover_disk{ "test_ht", epoch }; + file_t recover_file = recover_disk.NewFile("test_ht.dat"); + Status result = recover_file.Open(&recover_disk.handler()); + ASSERT_EQ(Status::Ok, result); + + InternalHashTable recover_table{}; + //issue call to recover + result = recover_table.Recover(recover_disk, std::move(recover_file), num_bytes_written); + ASSERT_EQ(Status::Ok, result); + //wait until complete + result = recover_table.RecoverComplete(true); + ASSERT_EQ(Status::Ok, result); + + //verify that something + std::mt19937_64 rng2{ seed }; + for(size_t bucket_idx = 0; bucket_idx < kNumBuckets; ++bucket_idx) { + for(size_t entry_idx = 0; entry_idx < HashBucket::kNumEntries; ++entry_idx) { + uint64_t random_num = rng2(); + ASSERT_EQ(random_num, recover_table.bucket(bucket_idx).entries[entry_idx].load().control_); + } + uint64_t random_num = rng2(); + ASSERT_EQ(random_num, recover_table.bucket(bucket_idx).overflow_entry.load().control_); + } +} + +TEST(CLASS, Serial) { + class Key { + public: + Key(uint32_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn{}; + return KeyHash{ hash_fn(key_) }; + } + + /// Comparison operators. 
+ inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint32_t key_; + }; + static_assert(sizeof(Key) == 4, "sizeof(Key) != 4"); + static_assert(alignof(Key) == 4, "alignof(Key) != 4"); + + class UpsertContext1; + class UpsertContext2; + class ReadContext1; + class ReadContext2; + + class Value1 { + public: + inline uint32_t size() const { + return size_; + } + + friend class UpsertContext1; + friend class UpsertContext2; + friend class ReadContext1; + + private: + uint16_t size_; + union { + std::atomic atomic_val1_; + uint32_t val1_; + }; + }; + static_assert(sizeof(Value1) == 8, "sizeof(Value1) != 8"); + static_assert(alignof(Value1) == 4, "alignof(Value1) != 4"); + + class Value2 : public Value1 { + public: + friend class UpsertContext2; + friend class ReadContext2; + + private: + union { + std::atomic atomic_val2_; + uint16_t val2_; + }; + uint8_t wasted_space[3]; + }; + static_assert(sizeof(Value2) == 16, "sizeof(Value2) != 12"); + static_assert(alignof(Value2) == 4, "alignof(Value2) != 4"); + + class UpsertContext1 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value1 value_t; + + UpsertContext1(const Key& key, uint32_t val) + : key_{ key } + , val_{ val } { + } + + /// Copy (and deep-copy) constructor. + UpsertContext1(const UpsertContext1& other) + : key_{ other.key_ } + , val_{ other.val_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void Put(Value1& value) { + value.size_ = sizeof(value); + value.val1_ = val_; + } + inline bool PutAtomic(Value1& value) { + EXPECT_EQ(value.size_, sizeof(value)); + value.atomic_val1_.store(val_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + }; + + class UpsertContext2 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value2 value_t; + + UpsertContext2(const Key& key, uint16_t val) + : key_{ key } + , val_{ val } { + } + + /// Copy (and deep-copy) constructor. + UpsertContext2(const UpsertContext2& other) + : key_{ other.key_ } + , val_{ other.val_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void Put(Value2& value) { + value.size_ = sizeof(value); + value.val2_ = val_; + } + inline bool PutAtomic(Value2& value) { + EXPECT_EQ(value.size_, sizeof(value)); + value.atomic_val2_.store(val_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint16_t val_; + }; + + class ReadContext1 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value1 value_t; + + ReadContext1(Key key, uint32_t expected_) + : key_{ key } + , val_{ 0 } + , expected{ expected_ } { + } + + /// Copy (and deep-copy) constructor. 
+ ReadContext1(const ReadContext1& other) + : key_{ other.key_ } + , val_{ other.val_ } + , expected{ other.expected } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value1& value) { + val_ = value.val1_; + } + inline void GetAtomic(const Value1& value) { + val_ = value.atomic_val1_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + public: + const uint32_t expected; + }; + + class ReadContext2 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value2 value_t; + + ReadContext2(Key key, uint16_t expected_) + : key_{ key } + , val_{ 0 } + , expected{ expected_ } { + } + + /// Copy (and deep-copy) constructor. + ReadContext2(const ReadContext2& other) + : key_{ other.key_ } + , val_{ other.val_ } + , expected{ other.expected } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value2& value) { + val_ = value.val2_; + } + inline void GetAtomic(const Value2& value) { + val_ = value.atomic_val2_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint16_t val_; + public: + const uint16_t expected; + }; + + auto upsert_callback = [](IAsyncContext* context, Status result) { + // Upserts don't go to disk. + ASSERT_TRUE(false); + }; + + std::experimental::filesystem::create_directories("storage"); + + static constexpr size_t kNumRecords = 6000000; + + Guid session_id; + + { + // Populate and checkpoint the store. + // 6 pages! 
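+ // 524288 hash-table buckets and a 192 MB in-memory log (6 x 32 MB pages,
+ // matching the 32 MB disk segments declared at the top of this header),
+ // checkpointing under "storage"; the 0.4 is presumably the mutable fraction
+ // of the in-memory log.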
+ FasterKv store{ 524288, 201326592, "storage", 0.4 }; + + session_id = store.StartSession(); + + // upsert some records + assert(kNumRecords % 2 == 0); + for(uint32_t idx = 0; idx < kNumRecords; idx += 2) { + { + UpsertContext1 context{ Key{ idx }, idx + 7 }; + Status result = store.Upsert(context, upsert_callback, 1); + ASSERT_EQ(Status::Ok, result); + } + { + UpsertContext2 context{ Key{ idx + 1 }, 55 }; + Status result = store.Upsert(context, upsert_callback, 1); + ASSERT_EQ(Status::Ok, result); + } + } + // verify them + static std::atomic records_read; + records_read = 0; + for(uint32_t idx = 0; idx < kNumRecords; idx += 2) { + auto callback1 = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ++records_read; + ASSERT_EQ(context->expected, context->val()); + }; + auto callback2 = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ++records_read; + ASSERT_EQ(context->expected, context->val()); + }; + + if(idx % 256 == 0) { + store.Refresh(); + store.CompletePending(false); + } + + { + ReadContext1 context{ Key{ idx }, idx + 7 }; + Status result = store.Read(context, callback1, 1); + if(result == Status::Ok) { + ++records_read; + ASSERT_EQ(context.expected, context.val()); + } else { + ASSERT_EQ(Status::Pending, result); + } + } + { + ReadContext2 context{ Key{ idx + 1 }, 55 }; + Status result = store.Read(context, callback2, 1); + if(result == Status::Ok) { + ++records_read; + ASSERT_EQ(context.expected, context.val()); + } else { + ASSERT_EQ(Status::Pending, result); + } + } + } + + static std::atomic num_threads_persistent; + num_threads_persistent = 0; + static std::atomic threads_persistent[Thread::kMaxNumThreads]; + for(size_t idx = 0; idx < Thread::kMaxNumThreads; ++idx) { + threads_persistent[idx] = false; + } + + auto persistence_callback = [](uint64_t persistent_serial_num) { + bool expected = false; + ASSERT_TRUE(threads_persistent[Thread::id()].compare_exchange_strong(expected, + true)); + ++num_threads_persistent; + }; + + // checkpoint (transition from REST to INDEX_CHKPT) + ASSERT_TRUE(store.Checkpoint(persistence_callback)); + + while(num_threads_persistent < 1) { + store.CompletePending(false); + } + + bool result = store.CompletePending(true); + ASSERT_TRUE(result); + ASSERT_EQ(kNumRecords, records_read.load()); + + store.StopSession(); + } + + // Test recovery. + FasterKv new_store{ 524288, 201326592, "storage", 0.4 }; + + std::vector session_ids; + Status status = new_store.Recover(1, 1, session_ids); + ASSERT_EQ(Status::Ok, status); + ASSERT_EQ(1, session_ids.size()); + ASSERT_EQ(session_id, session_ids[0]); + ASSERT_EQ(1, new_store.ContinueSession(session_id)); + + // Verify the recovered store. 
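+ // Recovery restored the index and log state from the checkpoint, and
+ // ContinueSession resumed the old session at its persisted serial number (1);
+ // every record upserted before the checkpoint should still read back with
+ // its pre-checkpoint value (idx + 7 and 55).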
+ static std::atomic records_read; + records_read = 0; + for(uint32_t idx = 0; idx < kNumRecords; idx += 2) { + auto callback1 = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result) << *reinterpret_cast(&context->key()); + ++records_read; + ASSERT_EQ(context->expected, context->val()); + }; + auto callback2 = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result) << *reinterpret_cast(&context->key()); + ++records_read; + ASSERT_EQ(context->expected, context->val()); + }; + + if(idx % 256 == 0) { + new_store.Refresh(); + new_store.CompletePending(false); + } + + { + ReadContext1 context{ Key{ idx }, idx + 7 }; + Status result = new_store.Read(context, callback1, 1); + if(result == Status::Ok) { + ++records_read; + ASSERT_EQ(context.expected, context.val()); + } else { + ASSERT_EQ(Status::Pending, result); + } + } + { + ReadContext2 context{ Key{ idx + 1 }, 55 }; + Status result = new_store.Read(context, callback2, 1); + if(result == Status::Ok) { + ++records_read; + ASSERT_EQ(context.expected, context.val()); + } else { + ASSERT_EQ(Status::Pending, result); + } + } + } + + new_store.CompletePending(true); + ASSERT_EQ(records_read.load(), kNumRecords); + new_store.StopSession(); + + session_id = new_store.StartSession(); + + // Upsert some changes and verify them. + for(uint32_t idx = 0; idx < kNumRecords; idx += 2) { + { + UpsertContext1 context{ Key{ idx }, idx + 55 }; + Status result = new_store.Upsert(context, upsert_callback, 1); + ASSERT_EQ(Status::Ok, result); + } + { + UpsertContext2 context{ Key{ idx + 1 }, 77 }; + Status result = new_store.Upsert(context, upsert_callback, 1); + ASSERT_EQ(Status::Ok, result); + } + } + records_read = 0; + for(uint32_t idx = 0; idx < kNumRecords; idx += 2) { + auto callback1 = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ++records_read; + ASSERT_EQ(context->expected, context->val()); + }; + auto callback2 = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ++records_read; + ASSERT_EQ(context->expected, context->val()); + }; + + if(idx % 256 == 0) { + new_store.Refresh(); + new_store.CompletePending(false); + } + + { + ReadContext1 context{ Key{ idx }, idx + 55 }; + Status result = new_store.Read(context, callback1, 1); + if(result == Status::Ok) { + ++records_read; + ASSERT_EQ(context.expected, context.val()); + } else { + ASSERT_EQ(Status::Pending, result); + } + } + { + ReadContext2 context{ Key{ idx + 1 }, 77 }; + Status result = new_store.Read(context, callback2, 1); + if(result == Status::Ok) { + ++records_read; + ASSERT_EQ(context.expected, context.val()); + } else { + ASSERT_EQ(Status::Pending, result); + } + } + } + + new_store.CompletePending(true); + ASSERT_EQ(records_read.load(), kNumRecords); + new_store.StopSession(); +} + +TEST(CLASS, Serial_VariableLengthKey) { + class alignas(4) Key { + public: + Key(uint8_t len, uint32_t fill) + : len_{ len } { + for(uint8_t idx = 0; idx < len_; ++idx) { + buffer()[idx] = fill; + } + } + + /// Copy constructor. 
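    /// Layout note: the 4-byte header (len_ plus padding) is immediately followed, in the
    /// same allocation, by len_ uint32_t words, so
    ///   size()   == sizeof(Key) + len_ * sizeof(uint32_t)
    ///   buffer() == reinterpret_cast<uint32_t*>(this + 1)
    /// which is why the copy below memcpy()s len_ * sizeof(uint32_t) bytes out of buffer().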
+ Key(const Key& other) + : len_{ other.len_ } { + std::memcpy(buffer(), other.buffer(), len_ * sizeof(uint32_t)); + } + + inline uint32_t size() const { + return sizeof(*this) + (len_ * sizeof(uint32_t)); + } + private: + inline uint32_t* buffer() { + return reinterpret_cast(this + 1); + } + public: + inline const uint32_t* buffer() const { + return reinterpret_cast(this + 1); + } + inline KeyHash GetHash() const { + return KeyHash{ Utility::HashBytes( + reinterpret_cast(buffer()), len_ * 2) }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return len_ == other.len_ && + std::memcmp(buffer(), other.buffer(), len_ * sizeof(uint32_t)) == 0; + } + inline bool operator!=(const Key& other) const { + return len_ != other.len_ || + std::memcmp(buffer(), other.buffer(), len_ * sizeof(uint32_t)) != 0; + } + + private: + uint8_t len_; + + }; + static_assert(sizeof(Key) == 4, "sizeof(Key) != 4"); + static_assert(alignof(Key) == 4, "alignof(Key) != 4"); + + class UpsertContext1; + class UpsertContext2; + class ReadContext1; + class ReadContext2; + + class Value1 { + public: + inline uint32_t size() const { + return size_; + } + + friend class UpsertContext1; + friend class UpsertContext2; + friend class ReadContext1; + + private: + uint16_t size_; + union { + std::atomic atomic_val1_; + uint32_t val1_; + }; + }; + static_assert(sizeof(Value1) == 8, "sizeof(Value1) != 8"); + static_assert(alignof(Value1) == 4, "alignof(Value1) != 4"); + + class Value2 : public Value1 { + public: + friend class UpsertContext2; + friend class ReadContext2; + + private: + union { + std::atomic atomic_val2_; + uint16_t val2_; + }; + uint8_t wasted_space[3]; + }; + static_assert(sizeof(Value2) == 16, "sizeof(Value2) != 12"); + static_assert(alignof(Value2) == 4, "alignof(Value2) != 4"); + + class UpsertContext1 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value1 value_t; + + UpsertContext1(uint32_t key, uint32_t val) + : val_{ val } { + uint8_t len = (key % 16) + 1; + key_ = alloc_context(sizeof(key_t) + (len * sizeof(uint32_t))); + new(key_.get()) key_t{ len, key }; + } + + /// Deep-copy constructor. + UpsertContext1(UpsertContext1& other) + : key_{ std::move(other.key_) } + , val_{ other.val_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return *key_.get(); + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void Put(Value1& value) { + value.size_ = sizeof(value); + value.val1_ = val_; + } + inline bool PutAtomic(Value1& value) { + EXPECT_EQ(value.size_, sizeof(value)); + value.atomic_val1_.store(val_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + context_unique_ptr_t key_; + uint32_t val_; + }; + + class UpsertContext2 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value2 value_t; + + UpsertContext2(uint32_t key, uint16_t val) + : val_{ val } { + uint8_t len = (key % 16) + 1; + key_ = alloc_context(sizeof(key_t) + (len * sizeof(uint32_t))); + new(key_.get()) key_t{ len, key }; + } + + /// Deep-copy constructor. + UpsertContext2(UpsertContext2& other) + : key_{ std::move(other.key_) } + , val_{ other.val_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. 
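  // The upsert/read contexts in this test own their variable-length keys on the heap: a raw
  // buffer big enough for the header plus payload is obtained from the context allocator and
  // the key is placement-constructed into it. A sketch of that pattern, where the <key_t>
  // template arguments and the typed context_unique_ptr_t<key_t> member are assumed spellings:
  uint8_t len = (key % 16) + 1;                                         // 1..16 payload words
  context_unique_ptr_t<key_t> key_ =
      alloc_context<key_t>(sizeof(key_t) + len * sizeof(uint32_t));     // sized raw buffer
  new(key_.get()) key_t{ len, key };                                    // construct the key in place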
+ inline const Key& key() const { + return *key_.get(); + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void Put(Value2& value) { + value.size_ = sizeof(value); + value.val2_ = val_; + } + inline bool PutAtomic(Value2& value) { + EXPECT_EQ(value.size_, sizeof(value)); + value.atomic_val2_.store(val_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + context_unique_ptr_t key_; + uint16_t val_; + }; + + class ReadContext1 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value1 value_t; + + ReadContext1(uint32_t key, uint32_t expected_) + : val_{ 0 } + , expected{ expected_ } { + uint8_t len = (key % 16) + 1; + key_ = alloc_context(sizeof(key_t) + (len * sizeof(uint32_t))); + new(key_.get()) key_t{ len, key }; + } + + /// Deep-copy constructor. + ReadContext1(ReadContext1& other) + : key_{ std::move(other.key_) } + , val_{ other.val_ } + , expected{ other.expected } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return *key_.get(); + } + + inline void Get(const Value1& value) { + val_ = value.val1_; + } + inline void GetAtomic(const Value1& value) { + val_ = value.atomic_val1_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + context_unique_ptr_t key_; + uint32_t val_; + public: + const uint32_t expected; + }; + + class ReadContext2 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value2 value_t; + + ReadContext2(uint32_t key, uint16_t expected_) + : val_{ 0 } + , expected{ expected_ } { + uint8_t len = (key % 16) + 1; + key_ = alloc_context(sizeof(key_t) + (len * sizeof(uint32_t))); + new(key_.get()) key_t{ len, key }; + } + + /// Deep-copy constructor. + ReadContext2(ReadContext2& other) + : key_{ std::move(other.key_) } + , val_{ other.val_ } + , expected{ other.expected } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return *key_.get(); + } + + inline void Get(const Value2& value) { + val_ = value.val2_; + } + inline void GetAtomic(const Value2& value) { + val_ = value.atomic_val2_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + context_unique_ptr_t key_; + uint16_t val_; + public: + const uint16_t expected; + }; + + auto upsert_callback = [](IAsyncContext* context, Status result) { + // Upserts don't go to disk. + ASSERT_TRUE(false); + }; + + std::experimental::filesystem::create_directories("storage"); + + static constexpr size_t kNumRecords = 6000000; + + Guid session_id; + + { + // Populate and checkpoint the store. + // 6 pages! 
+ FasterKv store{ 524288, 201326592, "storage", 0.4 }; + + session_id = store.StartSession(); + + // upsert some records + assert(kNumRecords % 2 == 0); + for(uint32_t idx = 0; idx < kNumRecords; idx += 2) { + { + UpsertContext1 context{ idx, idx + 7 }; + Status result = store.Upsert(context, upsert_callback, 1); + ASSERT_EQ(Status::Ok, result); + } + { + UpsertContext2 context{ idx + 1, 55 }; + Status result = store.Upsert(context, upsert_callback, 1); + ASSERT_EQ(Status::Ok, result); + } + } + // verify them + static std::atomic records_read; + records_read = 0; + for(uint32_t idx = 0; idx < kNumRecords; idx += 2) { + auto callback1 = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ++records_read; + ASSERT_EQ(context->expected, context->val()); + }; + auto callback2 = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ++records_read; + ASSERT_EQ(context->expected, context->val()); + }; + + if(idx % 256 == 0) { + store.Refresh(); + store.CompletePending(false); + } + + { + ReadContext1 context{ idx, idx + 7 }; + Status result = store.Read(context, callback1, 1); + if(result == Status::Ok) { + ++records_read; + ASSERT_EQ(context.expected, context.val()); + } else { + ASSERT_EQ(Status::Pending, result); + } + } + { + ReadContext2 context{ idx + 1, 55 }; + Status result = store.Read(context, callback2, 1); + if(result == Status::Ok) { + ++records_read; + ASSERT_EQ(context.expected, context.val()); + } else { + ASSERT_EQ(Status::Pending, result); + } + } + } + + static std::atomic num_threads_persistent; + num_threads_persistent = 0; + static std::atomic threads_persistent[Thread::kMaxNumThreads]; + for(size_t idx = 0; idx < Thread::kMaxNumThreads; ++idx) { + threads_persistent[idx] = false; + } + + auto persistence_callback = [](uint64_t persistent_serial_num) { + bool expected = false; + ASSERT_TRUE(threads_persistent[Thread::id()].compare_exchange_strong(expected, + true)); + ++num_threads_persistent; + }; + + // checkpoint (transition from REST to INDEX_CHKPT) + ASSERT_TRUE(store.Checkpoint(persistence_callback)); + + while(num_threads_persistent < 1) { + store.CompletePending(false); + } + + bool result = store.CompletePending(true); + ASSERT_TRUE(result); + ASSERT_EQ(kNumRecords, records_read.load()); + + store.StopSession(); + } + + // Test recovery. + FasterKv new_store{ 524288, 201326592, "storage", 0.4 }; + + std::vector session_ids; + Status status = new_store.Recover(1, 1, session_ids); + ASSERT_EQ(Status::Ok, status); + ASSERT_EQ(1, session_ids.size()); + ASSERT_EQ(session_id, session_ids[0]); + ASSERT_EQ(1, new_store.ContinueSession(session_id)); + + // Verify the recovered store. 
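  // The per-thread cadence used by these verification loops, condensed: refresh the thread's
  // view every 256 operations so epoch-protected work and checkpoint phases can advance, drain
  // completions without blocking, and only block once at the very end.
  for(uint32_t idx = 0; idx < kNumRecords; ++idx) {
    // ... issue the Upsert()/Read() for idx ...
    if(idx % 256 == 0) {
      new_store.Refresh();               // observe epoch / state-machine transitions
      new_store.CompletePending(false);  // non-blocking drain of pending completions
    }
  }
  new_store.CompletePending(true);       // block until everything outstanding has completed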
+ static std::atomic records_read; + records_read = 0; + for(uint32_t idx = 0; idx < kNumRecords; idx += 2) { + auto callback1 = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result) << *reinterpret_cast(&context->key()); + ++records_read; + ASSERT_EQ(context->expected, context->val()); + }; + auto callback2 = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result) << *reinterpret_cast(&context->key()); + ++records_read; + ASSERT_EQ(context->expected, context->val()); + }; + + if(idx % 256 == 0) { + new_store.Refresh(); + new_store.CompletePending(false); + } + + { + ReadContext1 context{ idx, idx + 7 }; + Status result = new_store.Read(context, callback1, 1); + if(result == Status::Ok) { + ++records_read; + ASSERT_EQ(context.expected, context.val()); + } else { + ASSERT_EQ(Status::Pending, result); + } + } + { + ReadContext2 context{ idx + 1, 55 }; + Status result = new_store.Read(context, callback2, 1); + if(result == Status::Ok) { + ++records_read; + ASSERT_EQ(context.expected, context.val()); + } else { + ASSERT_EQ(Status::Pending, result); + } + } + } + + new_store.CompletePending(true); + ASSERT_EQ(records_read.load(), kNumRecords); + new_store.StopSession(); + + session_id = new_store.StartSession(); + + // Upsert some changes and verify them. + for(uint32_t idx = 0; idx < kNumRecords; idx += 2) { + { + UpsertContext1 context{ idx, idx + 55 }; + Status result = new_store.Upsert(context, upsert_callback, 1); + ASSERT_EQ(Status::Ok, result); + } + { + UpsertContext2 context{ idx + 1, 77 }; + Status result = new_store.Upsert(context, upsert_callback, 1); + ASSERT_EQ(Status::Ok, result); + } + } + records_read = 0; + for(uint32_t idx = 0; idx < kNumRecords; idx += 2) { + auto callback1 = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ++records_read; + ASSERT_EQ(context->expected, context->val()); + }; + auto callback2 = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ++records_read; + ASSERT_EQ(context->expected, context->val()); + }; + + if(idx % 256 == 0) { + new_store.Refresh(); + new_store.CompletePending(false); + } + + { + ReadContext1 context{ idx, idx + 55 }; + Status result = new_store.Read(context, callback1, 1); + if(result == Status::Ok) { + ++records_read; + ASSERT_EQ(context.expected, context.val()); + } else { + ASSERT_EQ(Status::Pending, result); + } + } + { + ReadContext2 context{ idx + 1, 77 }; + Status result = new_store.Read(context, callback2, 1); + if(result == Status::Ok) { + ++records_read; + ASSERT_EQ(context.expected, context.val()); + } else { + ASSERT_EQ(Status::Pending, result); + } + } + } + + new_store.CompletePending(true); + ASSERT_EQ(records_read.load(), kNumRecords); + new_store.StopSession(); +} + +TEST(CLASS, Concurrent_Insert_Small) { + class Key { + public: + Key(uint32_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn{}; + return KeyHash{ hash_fn(key_) }; + } + + /// Comparison operators. 
+ inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint32_t key_; + }; + static_assert(sizeof(Key) == 4, "sizeof(Key) != 4"); + static_assert(alignof(Key) == 4, "alignof(Key) != 4"); + + class UpsertContext; + class ReadContext1; + class ReadContext2; + + class Value { + public: + Value() + : val_{ 0 } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class UpsertContext; + friend class ReadContext1; + friend class ReadContext2; + + private: + union { + std::atomic atomic_val_; + uint32_t val_; + }; + }; + static_assert(sizeof(Value) == 4, "sizeof(Value) != 4"); + static_assert(alignof(Value) == 4, "alignof(Value) != 4"); + + class UpsertContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + UpsertContext(const Key& key, uint32_t val) + : key_{ key } + , val_{ val } { + } + + /// Copy (and deep-copy) constructor. + UpsertContext(const UpsertContext& other) + : key_{ other.key_ } + , val_{ other.val_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void Put(Value& value) { + value.val_ = val_; + } + inline bool PutAtomic(Value& value) { + value.atomic_val_.store(val_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + }; + + static auto upsert_callback = [](IAsyncContext* context, Status result) { + // Upserts don't go to disk. + ASSERT_TRUE(false); + }; + + std::experimental::filesystem::create_directories("storage"); + + static constexpr uint32_t kNumRecords = 200000; + static constexpr uint32_t kNumThreads = 16; + static constexpr uint32_t kNumRecordsPerThread = kNumRecords / kNumThreads; + + static Guid session_ids[kNumThreads]; + std::memset(session_ids, 0, sizeof(session_ids)); + + static std::atomic num_threads_persistent; + num_threads_persistent = 0; + static std::atomic threads_persistent[Thread::kMaxNumThreads]; + for(size_t idx = 0; idx < Thread::kMaxNumThreads; ++idx) { + threads_persistent[idx] = false; + } + + static std::atomic num_threads_started; + num_threads_started = 0; + + static auto persistence_callback = [](uint64_t persistent_serial_num) { + bool expected = false; + ASSERT_TRUE(threads_persistent[Thread::id()].compare_exchange_strong(expected, true)); + ++num_threads_persistent; + }; + + typedef FasterKv store_t; + + class ReadContext1 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext1(Key key, uint32_t expected_) + : key_{ key } + , val_{ 0 } + , expected{ expected_ } { + } + + /// Copy (and deep-copy) constructor. + ReadContext1(const ReadContext1& other) + : key_{ other.key_ } + , val_{ other.val_ } + , expected{ other.expected } { + } + + /// The implicit and explicit interfaces require a key() accessor. 
+ inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + val_ = value.val_; + } + inline void GetAtomic(const Value& value) { + val_ = value.atomic_val_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + public: + const uint32_t expected; + }; + + { + // Populate and checkpoint the store. + + // 6 pages! + store_t store{ 8192, 201326592, "storage", 0.4 }; + + auto upsert_checkpoint_worker = [](store_t* store, uint32_t thread_id) { + assert(thread_id == 0); + session_ids[thread_id] = store->StartSession(); + ++num_threads_started; + + // upsert some records + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + UpsertContext context{ Key{ idx }, idx + 7 }; + + Status result = store->Upsert(context, upsert_callback, 1); + ASSERT_EQ(Status::Ok, result); + + if(idx % 256 == 0) { + store->Refresh(); + } + } + + while(num_threads_started < kNumThreads) { + std::this_thread::yield(); + } + // checkpoint (transition from REST to INDEX_CHKPT) + ASSERT_TRUE(store->Checkpoint(persistence_callback)); + + // Ensure that the checkpoint completes. + while(num_threads_persistent < kNumThreads) { + store->CompletePending(false); + } + + bool result = store->CompletePending(true); + ASSERT_TRUE(result); + store->StopSession(); + }; + + auto upsert_worker = [](store_t* store, uint32_t thread_id) { + assert(thread_id != 0); + session_ids[thread_id] = store->StartSession(); + ++num_threads_started; + + // upsert some records + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + UpsertContext context{ Key{ idx }, idx + 7 }; + Status result = store->Upsert(context, upsert_callback, 1); + ASSERT_EQ(Status::Ok, result); + + if(idx % 256 == 0) { + store->Refresh(); + } + } + + // Don't exit this session until the checkpoint has completed. + while(num_threads_persistent < kNumThreads) { + store->CompletePending(false); + } + + bool result = store->CompletePending(true); + ASSERT_TRUE(result); + store->StopSession(); + }; + + std::deque threads{}; + threads.emplace_back(upsert_checkpoint_worker, &store, 0); + for(uint32_t idx = 1; idx < kNumThreads; ++idx) { + threads.emplace_back(upsert_worker, &store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + // Verify the store. + store.StartSession(); + + for(uint32_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ASSERT_EQ(context->expected, context->val()); + }; + + ReadContext1 context{ Key{ idx }, idx + 7 }; + Status result = store.Read(context, callback, 1); + if(result != Status::Ok) { + ASSERT_EQ(Status::Pending, result); + } + } + + store.StopSession(); + } + + // Test recovery. 
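  // Condensed shape of the multi-threaded checkpoint above, before recovery is tested below:
  // thread 0 acts as the coordinator, and every session keeps pumping CompletePending()
  // until its own persistence callback has fired.
  auto worker = [](store_t* store, uint32_t thread_id) {
    session_ids[thread_id] = store->StartSession();
    ++num_threads_started;
    // ... upserts for this thread's key range, with periodic store->Refresh() ...
    if(thread_id == 0) {
      while(num_threads_started < kNumThreads) std::this_thread::yield();
      ASSERT_TRUE(store->Checkpoint(persistence_callback));
    }
    while(num_threads_persistent < kNumThreads) {
      store->CompletePending(false);   // all sessions must participate until the checkpoint completes
    }
    ASSERT_TRUE(store->CompletePending(true));
    store->StopSession();
  };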
+  store_t new_store{ 8192, 201326592, "storage", 0.4 };
+
+  std::vector<Guid> recovered_session_ids;
+  Status status = new_store.Recover(1, 1, recovered_session_ids);
+  ASSERT_EQ(recovered_session_ids.size(), kNumThreads);
+  ASSERT_EQ(Status::Ok, status);
+
+  static std::atomic<uint64_t> records_read;
+  records_read = 0;
+
+  class ReadContext2 : public IAsyncContext {
+   public:
+    typedef Key key_t;
+    typedef Value value_t;
+
+    ReadContext2(Key key, uint32_t expected_, uint32_t idx_, std::atomic<bool>* found_)
+      : key_{ key }
+      , val_{ 0 }
+      , expected{ expected_ }
+      , idx{ idx_ }
+      , found{ found_ } {
+    }
+
+    /// Copy (and deep-copy) constructor.
+    ReadContext2(const ReadContext2& other)
+      : key_{ other.key_ }
+      , val_{ other.val_ }
+      , expected{ other.expected }
+      , idx{ other.idx }
+      , found{ other.found } {
+    }
+
+    /// The implicit and explicit interfaces require a key() accessor.
+    inline const Key& key() const {
+      return key_;
+    }
+
+    inline void Get(const Value& value) {
+      val_ = value.val_;
+    }
+    inline void GetAtomic(const Value& value) {
+      val_ = value.atomic_val_.load();
+    }
+
+    uint64_t val() const {
+      return val_;
+    }
+
+   protected:
+    /// The explicit interface requires a DeepCopy_Internal() implementation.
+    Status DeepCopy_Internal(IAsyncContext*& context_copy) {
+      return IAsyncContext::DeepCopy_Internal(*this, context_copy);
+    }
+
+   private:
+    Key key_;
+    uint32_t val_;
+   public:
+    const uint32_t expected;
+    const uint32_t idx;
+    std::atomic<bool>* found;
+  };
+
+  auto read_worker = [](store_t* store, uint32_t thread_id) {
+    uint64_t serial_num = store->ContinueSession(session_ids[thread_id]);
+    ASSERT_EQ(1, serial_num);
+
+    std::unique_ptr<std::atomic<bool>> found{ new std::atomic<bool>[kNumRecordsPerThread] };
+    std::memset(found.get(), 0, sizeof(found.get()[0]) * kNumRecordsPerThread);
+
+    // verify records
+    auto callback = [](IAsyncContext* ctxt, Status result) {
+      CallbackContext<ReadContext2> context{ ctxt };
+      if(result == Status::Ok) {
+        ++records_read;
+        ASSERT_EQ(context->expected, context->val());
+        bool expected = false;
+        ASSERT_TRUE(context->found[context->idx].compare_exchange_strong(expected, true));
+      } else {
+        ASSERT_EQ(Status::NotFound, result);
+        ASSERT_FALSE(context->found[context->idx].load());
+      }
+    };
+    for(uint32_t idx = kNumRecordsPerThread * thread_id;
+        idx < kNumRecordsPerThread * (thread_id + 1); ++idx) {
+      ReadContext2 context{ Key{ idx }, idx + 7, idx - (kNumRecordsPerThread * thread_id),
+                            found.get() };
+      Status result = store->Read(context, callback, 1);
+      if(result == Status::Ok) {
+        ++records_read;
+        ASSERT_EQ(context.expected, context.val());
+        bool expected = false;
+        ASSERT_TRUE(found.get()[context.idx].compare_exchange_strong(expected, true));
+      } else {
+        ASSERT_TRUE(result == Status::Pending || result == Status::NotFound);
+        if(result == Status::NotFound) {
+          ASSERT_FALSE(found.get()[context.idx].load());
+        }
+      }
+
+      if(idx % 256 == 0) {
+        store->Refresh();
+        store->CompletePending(false);
+      }
+    }
+    store->CompletePending(true);
+    store->StopSession();
+
+    bool found_all = true;
+    for(uint32_t idx = 0; idx < kNumRecordsPerThread; ++idx) {
+      if(found_all != found.get()[idx]) {
+        // Consistent-point recovery implies that after one record isn't found, all subsequent
+        // records will not be found.
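        // Equivalently: found[] must be a prefix of hits followed only by misses, i.e. for any
        // i < j, found[j] implies found[i]. The check below walks the array once and reports the
        // offending key's hash-bucket index and tag if a hit ever follows a miss.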
+ Key key{ kNumRecordsPerThread* thread_id + idx }; + KeyHash hash = key.GetHash(); + std::string error; + error += "key = "; + error += std::to_string(kNumRecordsPerThread* thread_id + idx); + error += ", idx = "; + error += std::to_string(hash.idx(8192)); + error += ", tag = "; + error += std::to_string(hash.tag()); + ASSERT_TRUE(found_all) << error; + found_all = false; + } + } + }; + + std::deque threads{}; + for(uint32_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(read_worker, &new_store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + ASSERT_GT(records_read, (uint32_t)0); + ASSERT_LE(records_read, kNumRecords); +} + +TEST(CLASS, Concurrent_Insert_Large) { + class Key { + public: + Key(uint32_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn{}; + return KeyHash{ hash_fn(key_) }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint32_t key_; + }; + static_assert(sizeof(Key) == 4, "sizeof(Key) != 4"); + static_assert(alignof(Key) == 4, "alignof(Key) != 4"); + + class UpsertContext; + class ReadContext1; + class ReadContext2; + + class Value { + public: + Value() + : val_{ 0 } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class UpsertContext; + friend class ReadContext1; + friend class ReadContext2; + + private: + union { + std::atomic atomic_val_; + uint32_t val_; + }; + }; + static_assert(sizeof(Value) == 4, "sizeof(Value) != 4"); + static_assert(alignof(Value) == 4, "alignof(Value) != 4"); + + class UpsertContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + UpsertContext(const Key& key, uint32_t val) + : key_{ key } + , val_{ val } { + } + + /// Copy (and deep-copy) constructor. + UpsertContext(const UpsertContext& other) + : key_{ other.key_ } + , val_{ other.val_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void Put(Value& value) { + value.val_ = val_; + } + inline bool PutAtomic(Value& value) { + value.atomic_val_.store(val_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + }; + + static auto upsert_callback = [](IAsyncContext* context, Status result) { + // Upserts don't go to disk. 
+ ASSERT_TRUE(false); + }; + + std::experimental::filesystem::create_directories("storage"); + + static constexpr uint32_t kNumRecords = 6000000; + static constexpr uint32_t kNumThreads = 16; + static constexpr uint32_t kNumRecordsPerThread = kNumRecords / kNumThreads; + + static Guid session_ids[kNumThreads]; + std::memset(session_ids, 0, sizeof(session_ids)); + + static std::atomic num_threads_persistent; + num_threads_persistent = 0; + static std::atomic threads_persistent[Thread::kMaxNumThreads]; + for(size_t idx = 0; idx < Thread::kMaxNumThreads; ++idx) { + threads_persistent[idx] = false; + } + + static std::atomic num_threads_started; + num_threads_started = 0; + + static auto persistence_callback = [](uint64_t persistent_serial_num) { + bool expected = false; + ASSERT_TRUE(threads_persistent[Thread::id()].compare_exchange_strong(expected, true)); + ++num_threads_persistent; + }; + + typedef FasterKv store_t; + + class ReadContext1 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext1(Key key, uint32_t expected_) + : key_{ key } + , val_{ 0 } + , expected{ expected_ } { + } + + /// Copy (and deep-copy) constructor. + ReadContext1(const ReadContext1& other) + : key_{ other.key_ } + , val_{ other.val_ } + , expected{ other.expected } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + val_ = value.val_; + } + inline void GetAtomic(const Value& value) { + val_ = value.atomic_val_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + public: + const uint32_t expected; + }; + + { + // Populate and checkpoint the store. + + // 6 pages! + store_t store{ 524288, 201326592, "storage", 0.4 }; + + auto upsert_checkpoint_worker = [](store_t* store, uint32_t thread_id) { + assert(thread_id == 0); + session_ids[thread_id] = store->StartSession(); + ++num_threads_started; + + // upsert some records + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + UpsertContext context{ Key{ idx }, idx + 7 }; + + Status result = store->Upsert(context, upsert_callback, 1); + ASSERT_EQ(Status::Ok, result); + + if(idx % 256 == 0) { + store->Refresh(); + } + } + + while(num_threads_started < kNumThreads) { + std::this_thread::yield(); + } + // checkpoint (transition from REST to INDEX_CHKPT) + ASSERT_TRUE(store->Checkpoint(persistence_callback)); + + // Ensure that the checkpoint completes. + while(num_threads_persistent < kNumThreads) { + store->CompletePending(false); + } + + bool result = store->CompletePending(true); + ASSERT_TRUE(result); + store->StopSession(); + }; + + auto upsert_worker = [](store_t* store, uint32_t thread_id) { + assert(thread_id != 0); + session_ids[thread_id] = store->StartSession(); + ++num_threads_started; + + // upsert some records + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + UpsertContext context{ Key{ idx }, idx + 7 }; + Status result = store->Upsert(context, upsert_callback, 1); + ASSERT_EQ(Status::Ok, result); + + if(idx % 256 == 0) { + store->Refresh(); + } + } + + // Don't exit this session until the checkpoint has completed. 
+ while(num_threads_persistent < kNumThreads) { + store->CompletePending(false); + } + + bool result = store->CompletePending(true); + ASSERT_TRUE(result); + store->StopSession(); + }; + + std::deque threads{}; + threads.emplace_back(upsert_checkpoint_worker, &store, 0); + for(uint32_t idx = 1; idx < kNumThreads; ++idx) { + threads.emplace_back(upsert_worker, &store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + // Verify the store. + store.StartSession(); + for(uint32_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ASSERT_EQ(context->expected, context->val()); + }; + + ReadContext1 context{ Key{ idx }, idx + 7 }; + Status result = store.Read(context, callback, 1); + if(result != Status::Ok) { + ASSERT_EQ(Status::Pending, result); + } + } + store.StopSession(); + } + + // Test recovery. + store_t new_store{ 524288, 201326592, "storage", 0.4 }; + + std::vector recovered_session_ids; + Status status = new_store.Recover(1, 1, recovered_session_ids); + ASSERT_EQ(recovered_session_ids.size(), kNumThreads); + ASSERT_EQ(Status::Ok, status); + + static std::atomic records_read; + records_read = 0; + + class ReadContext2 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext2(Key key, uint32_t expected_, uint32_t idx_, std::atomic* found_) + : key_{ key } + , val_{ 0 } + , expected{ expected_ } + , idx{ idx_ } + , found{ found_ } { + } + + /// Copy (and deep-copy) constructor. + ReadContext2(const ReadContext2& other) + : key_{ other.key_ } + , val_{ other.val_ } + , expected{ other.expected } + , idx{ other.idx } + , found{ other.found } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + val_ = value.val_; + } + inline void GetAtomic(const Value& value) { + val_ = value.atomic_val_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. 
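    /// (When an operation returns Status::Pending, the library uses this hook to move the
    /// stack-allocated context onto the heap; the completion callback later receives that copy
    /// and wraps it in CallbackContext<>, which frees it when the callback returns.)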
+ Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + public: + const uint32_t expected; + const uint32_t idx; + std::atomic* found; + }; + + auto read_worker = [](store_t* store, uint32_t thread_id) { + uint64_t serial_num = store->ContinueSession(session_ids[thread_id]); + ASSERT_EQ(1, serial_num); + + std::unique_ptr> found{ new std::atomic[kNumRecordsPerThread] }; + std::memset(found.get(), 0, sizeof(found.get()[0]) * kNumRecordsPerThread); + + // verify records + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + if(result == Status::Ok) { + ++records_read; + ASSERT_EQ(context->expected, context->val()); + bool expected = false; + ASSERT_TRUE(context->found[context->idx].compare_exchange_strong(expected, true)); + } else { + ASSERT_EQ(Status::NotFound, result); + ASSERT_FALSE(context->found[context->idx].load()); + } + }; + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + ReadContext2 context{ Key{ idx }, idx + 7, idx - (kNumRecordsPerThread * thread_id), + found.get() }; + Status result = store->Read(context, callback, 1); + if(result == Status::Ok) { + ++records_read; + ASSERT_EQ(context.expected, context.val()); + bool expected = false; + ASSERT_TRUE(found.get()[context.idx].compare_exchange_strong(expected, true)); + } else { + ASSERT_TRUE(result == Status::Pending || result == Status::NotFound); + if(result == Status::NotFound) { + ASSERT_FALSE(found.get()[context.idx].load()); + } + } + + if(idx % 256 == 0) { + store->Refresh(); + store->CompletePending(false); + } + } + store->CompletePending(true); + store->StopSession(); + + bool found_all = true; + for(uint32_t idx = 0; idx < kNumRecordsPerThread; ++idx) { + if(found_all != found.get()[idx]) { + // Consistent-point recovery implies that after one record isn't found, all subsequent + // records will not be found. + Key key{ kNumRecordsPerThread* thread_id + idx }; + KeyHash hash = key.GetHash(); + std::string error; + error += "key = "; + error += std::to_string(kNumRecordsPerThread* thread_id + idx); + error += ", idx = "; + error += std::to_string(hash.idx(8192)); + error += ", tag = "; + error += std::to_string(hash.tag()); + ASSERT_TRUE(found_all) << error; + found_all = false; + } + } + }; + + std::deque threads{}; + for(uint32_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(read_worker, &new_store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + ASSERT_GT(records_read, (uint32_t)0); + ASSERT_LE(records_read, kNumRecords); +} + +TEST(CLASS, Concurrent_Update_Small) { + class Key { + public: + Key(uint32_t key) + : key_{ key } { + } + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn{}; + return KeyHash{ hash_fn(key_) }; + } + + /// Comparison operators. 
+ inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint32_t key_; + }; + static_assert(sizeof(Key) == 4, "sizeof(Key) != 4"); + static_assert(alignof(Key) == 4, "alignof(Key) != 4"); + + class UpsertContext; + class ReadContext1; + class ReadContext2; + + class Value { + public: + Value() + : val_{ 0 } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class UpsertContext; + friend class ReadContext1; + friend class ReadContext2; + + private: + union { + std::atomic atomic_val_; + uint32_t val_; + }; + }; + static_assert(sizeof(Value) == 4, "sizeof(Value) != 4"); + static_assert(alignof(Value) == 4, "alignof(Value) != 4"); + + class UpsertContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + UpsertContext(const Key& key, uint32_t val) + : key_{ key } + , val_{ val } { + } + + /// Copy (and deep-copy) constructor. + UpsertContext(const UpsertContext& other) + : key_{ other.key_ } + , val_{ other.val_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void Put(Value& value) { + value.val_ = val_; + } + inline bool PutAtomic(Value& value) { + value.atomic_val_.store(val_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + }; + + static auto upsert_callback = [](IAsyncContext* context, Status result) { + // Upserts don't go to disk. + ASSERT_TRUE(false); + }; + + std::experimental::filesystem::create_directories("storage"); + + static constexpr uint32_t kNumRecords = 200000; + static constexpr uint32_t kNumThreads = 16; + static constexpr uint32_t kNumRecordsPerThread = kNumRecords / kNumThreads; + + static Guid session_ids[kNumThreads]; + std::memset(session_ids, 0, sizeof(session_ids)); + + static std::atomic num_threads_persistent; + num_threads_persistent = 0; + static std::atomic threads_persistent[Thread::kMaxNumThreads]; + for(size_t idx = 0; idx < Thread::kMaxNumThreads; ++idx) { + threads_persistent[idx] = false; + } + + static std::atomic num_threads_started; + num_threads_started = 0; + + static auto persistence_callback = [](uint64_t persistent_serial_num) { + bool expected = false; + ASSERT_TRUE(threads_persistent[Thread::id()].compare_exchange_strong(expected, + true)); + ++num_threads_persistent; + }; + + typedef FasterKv store_t; + + class ReadContext1 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext1(Key key, uint32_t expected_) + : key_{ key } + , val_{ 0 } + , expected{ expected_ } { + } + + /// Copy (and deep-copy) constructor. + ReadContext1(const ReadContext1& other) + : key_{ other.key_ } + , val_{ other.val_ } + , expected{ other.expected } { + } + + /// The implicit and explicit interfaces require a key() accessor. 
+ inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + val_ = value.val_; + } + inline void GetAtomic(const Value& value) { + val_ = value.atomic_val_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + public: + const uint32_t expected; + }; + + { + // 6 pages! + store_t store{ 8192, 201326592, "storage", 0.4 }; + + // Populate the store. + store.StartSession(); + for(uint32_t idx = 0; idx < kNumRecords; ++idx) { + UpsertContext context{ Key{ idx }, 999 }; + Status result = store.Upsert(context, upsert_callback, 1); + ASSERT_EQ(Status::Ok, result); + if(idx % 256 == 0) { + store.Refresh(); + store.CompletePending(false); + } + } + store.StopSession(); + + /// Update and checkpoint the store. + auto upsert_checkpoint_worker = [](store_t* store, uint32_t thread_id) { + assert(thread_id == 0); + session_ids[thread_id] = store->StartSession(); + ++num_threads_started; + + // update some records + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + UpsertContext context{ Key{ idx }, idx + 1 }; + + Status result = store->Upsert(context, upsert_callback, idx + 1); + ASSERT_EQ(Status::Ok, result); + + if(idx % 256 == 0) { + store->Refresh(); + } + } + + while(num_threads_started < kNumThreads) { + std::this_thread::yield(); + } + // checkpoint (transition from REST to INDEX_CHKPT) + ASSERT_TRUE(store->Checkpoint(persistence_callback)); + + // Ensure that the checkpoint completes. + while(num_threads_persistent < kNumThreads) { + store->CompletePending(false); + } + + bool result = store->CompletePending(true); + ASSERT_TRUE(result); + store->StopSession(); + }; + + auto upsert_worker = [](store_t* store, uint32_t thread_id) { + assert(thread_id != 0); + session_ids[thread_id] = store->StartSession(); + ++num_threads_started; + + // update some records + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + UpsertContext context{ Key{ idx }, idx + 1 }; + Status result = store->Upsert(context, upsert_callback, idx + 1); + ASSERT_EQ(Status::Ok, result); + + if(idx % 256 == 0) { + store->Refresh(); + } + } + + // Don't exit this session until the checkpoint has completed. + while(num_threads_persistent < kNumThreads) { + store->CompletePending(false); + } + + bool result = store->CompletePending(true); + ASSERT_TRUE(result); + store->StopSession(); + }; + + std::deque threads{}; + threads.emplace_back(upsert_checkpoint_worker, &store, 0); + for(uint32_t idx = 1; idx < kNumThreads; ++idx) { + threads.emplace_back(upsert_worker, &store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + // Verify the store. + store.StartSession(); + for(uint32_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ASSERT_EQ(context->expected, context->val()); + }; + + ReadContext1 context{ Key{ idx }, idx + 1 }; + Status result = store.Read(context, callback, 1); + if(result != Status::Ok) { + ASSERT_EQ(Status::Pending, result); + } + } + store.StopSession(); + } + + // Test recovery. 
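  // Sketch of how a client would use the serial numbers passed to Upsert() above once recovery
  // has run: ContinueSession() reports the last serial number the checkpoint captured for that
  // session, and everything after it gets re-issued. last_issued_serial and reissue_operation()
  // are hypothetical stand-ins for client-side bookkeeping, not part of this test.
  uint64_t serial_num = new_store.ContinueSession(session_ids[thread_id]);
  for(uint64_t s = serial_num + 1; s <= last_issued_serial; ++s) {
    reissue_operation(s);   // hypothetical: replay the operation originally issued with serial s
  }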
+ store_t new_store{ 8192, 201326592, "storage", 0.4 }; + + std::vector recovered_session_ids; + Status status = new_store.Recover(1, 1, recovered_session_ids); + ASSERT_EQ(recovered_session_ids.size(), kNumThreads); + ASSERT_EQ(Status::Ok, status); + + static std::atomic records_read; + records_read = 0; + + class ReadContext2 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext2(Key key, uint32_t expected_, uint32_t idx_, std::atomic* found_) + : key_{ key } + , val_{ 0 } + , expected{ expected_ } + , idx{ idx_ } + , found{ found_ } { + } + + /// Copy (and deep-copy) constructor. + ReadContext2(const ReadContext2& other) + : key_{ other.key_ } + , val_{ other.val_ } + , expected{ other.expected } + , idx{ other.idx } + , found{ other.found } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + val_ = value.val_; + } + inline void GetAtomic(const Value& value) { + val_ = value.atomic_val_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + public: + const uint32_t expected; + const uint32_t idx; + std::atomic* found; + }; + + auto read_worker = [](store_t* store, uint32_t thread_id) { + uint64_t serial_num = store->ContinueSession(session_ids[thread_id]); + ASSERT_GE(serial_num, 1); + + std::unique_ptr> found{ new std::atomic[kNumRecordsPerThread] }; + std::memset(found.get(), 0, sizeof(found.get()[0]) * kNumRecordsPerThread); + + // verify records + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + if(context->expected == context->val()) { + bool expected = false; + ASSERT_TRUE(context->found[context->idx].compare_exchange_strong(expected, true)); + } else { + ASSERT_EQ(999, context->val()); + bool expected = false; + ASSERT_FALSE(context->found[context->idx].load()); + } + }; + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + ReadContext2 context{ Key{ idx }, idx + 1, idx - (kNumRecordsPerThread * thread_id), + found.get() }; + Status result = store->Read(context, callback, 1); + if(result == Status::Ok) { + ++records_read; + if(context.expected == context.val()) { + bool expected = false; + ASSERT_TRUE(found.get()[context.idx].compare_exchange_strong(expected, true)); + } else { + ASSERT_EQ(999, context.val()); + bool expected = false; + ASSERT_FALSE(found.get()[context.idx].load()); + } + } else { + ASSERT_EQ(Status::Pending, result); + } + if(idx % 256 == 0) { + store->Refresh(); + store->CompletePending(false); + } + } + store->CompletePending(true); + store->StopSession(); + + bool found_all = true; + for(uint32_t idx = 0; idx < kNumRecordsPerThread; ++idx) { + if(found_all != found.get()[idx]) { + // Consistent-point recovery implies that after one record isn't found, all subsequent + // records will not be found. 
+ Key key{ kNumRecordsPerThread* thread_id + idx }; + KeyHash hash = key.GetHash(); + std::string error; + error += "key = "; + error += std::to_string(kNumRecordsPerThread* thread_id + idx); + error += ", idx = "; + error += std::to_string(hash.idx(8192)); + error += ", tag = "; + error += std::to_string(hash.tag()); + ASSERT_TRUE(found_all) << error; + found_all = false; + } + } + }; + + std::deque threads{}; + for(uint32_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(read_worker, &new_store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + ASSERT_GT(records_read, (uint32_t)0); + ASSERT_LE(records_read, kNumRecords); +} + +TEST(CLASS, Concurrent_Update_Large) { + class Key { + public: + Key(uint32_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn{}; + return KeyHash{ hash_fn(key_) }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + private: + uint32_t key_; + }; + static_assert(sizeof(Key) == 4, "sizeof(Key) != 4"); + static_assert(alignof(Key) == 4, "alignof(Key) != 4"); + + class UpsertContext; + class ReadContext1; + class ReadContext2; + + class Value { + public: + Value() + : val_{ 0 } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class UpsertContext; + friend class ReadContext1; + friend class ReadContext2; + + private: + union { + std::atomic atomic_val_; + uint32_t val_; + }; + }; + static_assert(sizeof(Value) == 4, "sizeof(Value) != 4"); + static_assert(alignof(Value) == 4, "alignof(Value) != 4"); + + class UpsertContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + UpsertContext(const Key& key, uint32_t val) + : key_{ key } + , val_{ val } { + } + + /// Copy (and deep-copy) constructor. + UpsertContext(const UpsertContext& other) + : key_{ other.key_ } + , val_{ other.val_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void Put(Value& value) { + value.val_ = val_; + } + inline bool PutAtomic(Value& value) { + value.atomic_val_.store(val_); + return true; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + }; + + static auto upsert_callback = [](IAsyncContext* context, Status result) { + // Upserts don't go to disk. 
+ ASSERT_TRUE(false); + }; + + std::experimental::filesystem::create_directories("storage"); + + static constexpr uint32_t kNumRecords = 10000000; + static constexpr uint32_t kNumThreads = 16; + static constexpr uint32_t kNumRecordsPerThread = kNumRecords / kNumThreads; + + static Guid session_ids[kNumThreads]; + std::memset(session_ids, 0, sizeof(session_ids)); + + static std::atomic num_threads_persistent; + num_threads_persistent = 0; + static std::atomic threads_persistent[Thread::kMaxNumThreads]; + for(size_t idx = 0; idx < Thread::kMaxNumThreads; ++idx) { + threads_persistent[idx] = false; + } + + static std::atomic num_threads_started; + num_threads_started = 0; + + static auto persistence_callback = [](uint64_t persistent_serial_num) { + bool expected = false; + ASSERT_TRUE(threads_persistent[Thread::id()].compare_exchange_strong(expected, true)); + ++num_threads_persistent; + }; + + typedef FasterKv store_t; + + class ReadContext1 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext1(Key key, uint32_t expected_) + : key_{ key } + , val_{ 0 } + , expected{ expected_ } { + } + + /// Copy (and deep-copy) constructor. + ReadContext1(const ReadContext1& other) + : key_{ other.key_ } + , val_{ other.val_ } + , expected{ other.expected } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + val_ = value.val_; + } + inline void GetAtomic(const Value& value) { + val_ = value.atomic_val_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + public: + const uint32_t expected; + }; + + { + // 6 pages! + store_t store{ 524288, 201326592, "storage", 0.4 }; + + // Populate the store. + store.StartSession(); + for(uint32_t idx = 0; idx < kNumRecords; ++idx) { + UpsertContext context{ Key{ idx }, 999 }; + Status result = store.Upsert(context, upsert_callback, 1); + ASSERT_EQ(Status::Ok, result); + if(idx % 256 == 0) { + store.Refresh(); + store.CompletePending(false); + } + } + + // Truncate some old copies of records that we no longer need. + static std::atomic truncated; + truncated = false; + static std::atomic complete; + complete = false; + auto truncate_callback = [](uint64_t offset) { + truncated = true; + }; + auto complete_callback = []() { + complete = true; + }; + ASSERT_TRUE(store.ShiftBeginAddress(Address{ 33554432L }, truncate_callback, + complete_callback)); + while(!truncated || !complete) { + store.CompletePending(false); + } + store.StopSession(); + + /// Update and checkpoint the store. 
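  // The truncation step above, condensed: retire the oldest 32 MiB page of the log before the
  // update/checkpoint phase that follows. Reading the two callbacks as "begin address has
  // shifted" and "on-disk truncation finished" is an assumption based on how the test waits
  // on both flags.
  static std::atomic<bool> truncated{ false };
  static std::atomic<bool> complete{ false };
  ASSERT_TRUE(store.ShiftBeginAddress(Address{ 33554432L },
                                      [](uint64_t offset) { truncated = true; },
                                      []() { complete = true; }));
  while(!truncated || !complete) {
    store.CompletePending(false);
  }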
+ auto upsert_checkpoint_worker = [](store_t* store, uint32_t thread_id) { + assert(thread_id == 0); + session_ids[thread_id] = store->StartSession(); + ++num_threads_started; + + // update some records + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + UpsertContext context{ Key{ idx }, idx + 1 }; + + Status result = store->Upsert(context, upsert_callback, idx + 1); + ASSERT_EQ(Status::Ok, result); + + if(idx % 256 == 0) { + store->Refresh(); + } + } + + while(num_threads_started < kNumThreads) { + std::this_thread::yield(); + } + // checkpoint (transition from REST to INDEX_CHKPT) + ASSERT_TRUE(store->Checkpoint(persistence_callback)); + + // Ensure that the checkpoint completes. + while(num_threads_persistent < kNumThreads) { + store->CompletePending(false); + } + + bool result = store->CompletePending(true); + ASSERT_TRUE(result); + store->StopSession(); + }; + + auto upsert_worker = [](store_t* store, uint32_t thread_id) { + assert(thread_id != 0); + session_ids[thread_id] = store->StartSession(); + ++num_threads_started; + + // update some records + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + UpsertContext context{ Key{ idx }, idx + 1 }; + Status result = store->Upsert(context, upsert_callback, idx + 1); + ASSERT_EQ(Status::Ok, result); + + if(idx % 256 == 0) { + store->Refresh(); + } + } + + // Don't exit this session until the checkpoint has completed. + while(num_threads_persistent < kNumThreads) { + store->CompletePending(false); + } + + bool result = store->CompletePending(true); + ASSERT_TRUE(result); + store->StopSession(); + }; + + std::deque threads{}; + threads.emplace_back(upsert_checkpoint_worker, &store, 0); + for(uint32_t idx = 1; idx < kNumThreads; ++idx) { + threads.emplace_back(upsert_worker, &store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + // Verify the store. + store.StartSession(); + for(uint32_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ASSERT_EQ(context->expected, context->val()); + }; + + ReadContext1 context{ Key{ idx }, idx + 1 }; + Status result = store.Read(context, callback, 1); + if(result != Status::Ok) { + ASSERT_EQ(Status::Pending, result); + } + if(idx % 256 == 0) { + store.Refresh(); + store.CompletePending(false); + } + } + + bool result = store.CompletePending(true); + ASSERT_TRUE(result); + store.StopSession(); + } + + // Test recovery. + store_t new_store{ 524288, 201326592, "storage", 0.4 }; + + std::vector recovered_session_ids; + Status status = new_store.Recover(1, 1, recovered_session_ids); + ASSERT_EQ(recovered_session_ids.size(), kNumThreads); + ASSERT_EQ(Status::Ok, status); + + static std::atomic records_read; + records_read = 0; + + class ReadContext2 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext2(Key key, uint32_t expected_, uint32_t idx_, std::atomic* found_) + : key_{ key } + , val_{ 0 } + , expected{ expected_ } + , idx{ idx_ } + , found{ found_ } { + } + + /// Copy (and deep-copy) constructor. + ReadContext2(const ReadContext2& other) + : key_{ other.key_ } + , val_{ other.val_ } + , expected{ other.expected } + , idx{ other.idx } + , found{ other.found } { + } + + /// The implicit and explicit interfaces require a key() accessor. 
+ inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + val_ = value.val_; + } + inline void GetAtomic(const Value& value) { + val_ = value.atomic_val_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + public: + const uint32_t expected; + const uint32_t idx; + std::atomic* found; + }; + + auto read_worker = [](store_t* store, uint32_t thread_id) { + uint64_t serial_num = store->ContinueSession(session_ids[thread_id]); + ASSERT_GE(serial_num, 1); + + std::unique_ptr> found{ new std::atomic[kNumRecordsPerThread] }; + std::memset(found.get(), 0, sizeof(found.get()[0]) * kNumRecordsPerThread); + + // verify records + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + if(context->expected == context->val()) { + bool expected = false; + ASSERT_TRUE(context->found[context->idx].compare_exchange_strong(expected, true)); + } else { + ASSERT_EQ(999, context->val()); + bool expected = false; + ASSERT_FALSE(context->found[context->idx].load()); + } + }; + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + ReadContext2 context{ Key{ idx }, idx + 1, idx - (kNumRecordsPerThread * thread_id), + found.get() }; + Status result = store->Read(context, callback, 1); + if(result == Status::Ok) { + ++records_read; + if(context.expected == context.val()) { + bool expected = false; + ASSERT_TRUE(found.get()[context.idx].compare_exchange_strong(expected, true)); + } else { + ASSERT_EQ(999, context.val()); + bool expected = false; + ASSERT_FALSE(found.get()[context.idx].load()); + } + } else { + ASSERT_EQ(Status::Pending, result); + } + if(idx % 256 == 0) { + store->Refresh(); + store->CompletePending(false); + } + } + store->CompletePending(true); + store->StopSession(); + + bool found_all = true; + for(uint32_t idx = 0; idx < kNumRecordsPerThread; ++idx) { + if(found_all != found.get()[idx]) { + // Consistent-point recovery implies that after one record isn't found, all subsequent + // records will not be found. + Key key{ kNumRecordsPerThread* thread_id + idx }; + KeyHash hash = key.GetHash(); + std::string error; + error += "key = "; + error += std::to_string(kNumRecordsPerThread* thread_id + idx); + error += ", idx = "; + error += std::to_string(hash.idx(8192)); + error += ", tag = "; + error += std::to_string(hash.tag()); + ASSERT_TRUE(found_all) << error; + found_all = false; + } + } + }; + + std::deque threads{}; + for(uint32_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(read_worker, &new_store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + ASSERT_GT(records_read, (uint32_t)0); + ASSERT_LE(records_read, kNumRecords); +} + +TEST(CLASS, Concurrent_Rmw_Small) { + class RmwContext; + + class Key { + public: + Key(uint32_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn{}; + return KeyHash{ hash_fn(key_) }; + } + + /// Comparison operators. 
+ inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + friend class RmwContext; + + private: + uint32_t key_; + }; + static_assert(sizeof(Key) == 4, "sizeof(Key) != 4"); + static_assert(alignof(Key) == 4, "alignof(Key) != 4"); + + class ReadContext1; + class ReadContext2; + + class Value { + public: + Value() + : val_{ 0 } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class RmwContext; + friend class ReadContext1; + friend class ReadContext2; + + private: + union { + std::atomic atomic_val_; + uint32_t val_; + }; + }; + static_assert(sizeof(Value) == 4, "sizeof(Value) != 4"); + static_assert(alignof(Value) == 4, "alignof(Value) != 4"); + + class RmwContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + RmwContext(const Key& key, uint32_t delta) + : key_{ key } + , delta_{ delta } { + } + + /// Copy (and deep-copy) constructor. + RmwContext(const RmwContext& other) + : key_{ other.key_ } + , delta_{ other.delta_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void RmwInitial(Value& value) { + value.val_ = key_.key_; + } + inline void RmwCopy(const value_t& old_value, value_t& value) { + value.val_ = old_value.val_ + delta_; + } + inline bool RmwAtomic(value_t& value) { + value.atomic_val_ += delta_; + return true; + } + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t delta_; + }; + + std::experimental::filesystem::create_directories("storage"); + + static constexpr uint32_t kNumRecords = 200000; + static constexpr uint32_t kNumThreads = 16; + static constexpr uint32_t kNumRecordsPerThread = kNumRecords / kNumThreads; + + static Guid session_ids[kNumThreads]; + std::memset(session_ids, 0, sizeof(session_ids)); + + static std::atomic num_threads_persistent; + num_threads_persistent = 0; + static std::atomic threads_persistent[Thread::kMaxNumThreads] = {}; + for(size_t idx = 0; idx < Thread::kMaxNumThreads; ++idx) { + threads_persistent[idx] = false; + } + + static std::atomic num_threads_started; + num_threads_started = 0; + + static auto persistence_callback = [](uint64_t persistent_serial_num) { + bool expected = false; + ASSERT_TRUE(threads_persistent[Thread::id()].compare_exchange_strong(expected, true)); + ++num_threads_persistent; + }; + + typedef FasterKv store_t; + + class ReadContext1 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext1(Key key, uint32_t expected_) + : key_{ key } + , val_{ 0 } + , expected{ expected_ } { + } + + /// Copy (and deep-copy) constructor. + ReadContext1(const ReadContext1& other) + : key_{ other.key_ } + , val_{ other.val_ } + , expected{ other.expected } { + } + + /// The implicit and explicit interfaces require a key() accessor. 
+ inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + val_ = value.val_; + } + inline void GetAtomic(const Value& value) { + val_ = value.atomic_val_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + public: + const uint32_t expected; + }; + + { + // 6 pages! + store_t store{ 8192, 402653184, "storage", 0.4 }; + + // Populate the store. + store.StartSession(); + for(uint32_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* context, Status result) { + ASSERT_EQ(Status::Ok, result); + }; + + RmwContext context{ Key{ idx }, 230 }; + Status result = store.Rmw(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + if(idx % 256 == 0) { + store.Refresh(); + store.CompletePending(false); + } + } + store.StopSession(); + + /// Read-modify-write and checkpoint the store. + auto rmw_checkpoint_worker = [](store_t* store, uint32_t thread_id) { + assert(thread_id == 0); + session_ids[thread_id] = store->StartSession(); + ++num_threads_started; + + // read-modify-write some records + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + auto callback = [](IAsyncContext* context, Status result) { + ASSERT_EQ(Status::Ok, result); + }; + RmwContext context{ Key{ idx }, 230 }; + Status result = store->Rmw(context, callback, idx + 1); + ASSERT_EQ(Status::Ok, result); + + if(idx % 256 == 0) { + store->Refresh(); + store->CompletePending(false); + } + } + + while(num_threads_started < kNumThreads) { + std::this_thread::yield(); + } + // checkpoint (transition from REST to INDEX_CHKPT) + ASSERT_TRUE(store->Checkpoint(persistence_callback)); + + // Ensure that the checkpoint completes. + while(num_threads_persistent < kNumThreads) { + store->CompletePending(false); + } + + bool result = store->CompletePending(true); + ASSERT_TRUE(result); + store->StopSession(); + }; + + auto rmw_worker = [](store_t* store, uint32_t thread_id) { + assert(thread_id != 0); + session_ids[thread_id] = store->StartSession(); + ++num_threads_started; + + // update some records + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + auto callback = [](IAsyncContext* context, Status result) { + ASSERT_EQ(Status::Ok, result); + }; + RmwContext context{ Key{ idx }, 230 }; + Status result = store->Rmw(context, callback, idx + 1); + ASSERT_EQ(Status::Ok, result); + + if(idx % 256 == 0) { + store->Refresh(); + store->CompletePending(false); + } + } + + // Don't exit this session until the checkpoint has completed. + while(num_threads_persistent < kNumThreads) { + store->CompletePending(false); + } + + bool result = store->CompletePending(true); + ASSERT_TRUE(result); + store->StopSession(); + }; + + std::deque threads{}; + threads.emplace_back(rmw_checkpoint_worker, &store, 0); + for(uint32_t idx = 1; idx < kNumThreads; ++idx) { + threads.emplace_back(rmw_worker, &store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + // Verify the store. 
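+    // Each key was seeded to its own value by RmwInitial during population and then incremented
+    // by 230 exactly once by the RMW workers, so every read should now return idx + 230.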
+ store.StartSession(); + for(uint32_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ASSERT_EQ(context->expected, context->val()); + }; + + ReadContext1 context{ Key{ idx }, idx + 230 }; + Status result = store.Read(context, callback, 1); + if(result != Status::Ok) { + ASSERT_EQ(Status::Pending, result); + } + } + store.StopSession(); + } + + // Test recovery. + store_t new_store{ 8192, 402653184, "storage", 0.4 }; + + std::vector recovered_session_ids; + Status status = new_store.Recover(1, 1, recovered_session_ids); + ASSERT_EQ(recovered_session_ids.size(), kNumThreads); + ASSERT_EQ(Status::Ok, status); + + static std::atomic records_read; + records_read = 0; + + class ReadContext2 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext2(Key key, uint32_t expected_, uint32_t idx_, std::atomic* found_) + : key_{ key } + , val_{ 0 } + , expected{ expected_ } + , idx{ idx_ } + , found{ found_ } { + } + + /// Copy (and deep-copy) constructor. + ReadContext2(const ReadContext2& other) + : key_{ other.key_ } + , val_{ other.val_ } + , expected{ other.expected } + , idx{ other.idx } + , found{ other.found } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + val_ = value.val_; + } + inline void GetAtomic(const Value& value) { + val_ = value.atomic_val_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + public: + const uint32_t expected; + const uint32_t idx; + std::atomic* found; + }; + + auto read_worker = [](store_t* store, uint32_t thread_id) { + uint64_t serial_num = store->ContinueSession(session_ids[thread_id]); + ASSERT_GE(serial_num, 1); + + std::unique_ptr> found{ new std::atomic[kNumRecordsPerThread] }; + std::memset(found.get(), 0, sizeof(found.get()[0]) * kNumRecordsPerThread); + + // verify records + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + if(context->expected == context->val()) { + bool expected = false; + ASSERT_TRUE(context->found[context->idx].compare_exchange_strong(expected, true)); + } else { + ASSERT_EQ(context->expected - 230, context->val()); + bool expected = false; + ASSERT_FALSE(context->found[context->idx].load()); + } + }; + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + ReadContext2 context{ Key{ idx }, idx + 230, idx - (kNumRecordsPerThread * thread_id), + found.get() }; + Status result = store->Read(context, callback, 1); + if(result == Status::Ok) { + ++records_read; + if(context.expected == context.val()) { + bool expected = false; + ASSERT_TRUE(found.get()[context.idx].compare_exchange_strong(expected, true)); + } else { + ASSERT_EQ(idx, context.val()); + bool expected = false; + ASSERT_FALSE(found.get()[context.idx].load()); + } + } else { + ASSERT_EQ(Status::Pending, result); + } + if(idx % 256 == 0) { + store->Refresh(); + store->CompletePending(false); + } + } + store->CompletePending(true); + store->StopSession(); + + bool found_all = true; + for(uint32_t idx = 0; 
idx < kNumRecordsPerThread; ++idx) { + if(found_all != found.get()[idx]) { + // Consistent-point recovery implies that after one record isn't found, all subsequent + // records will not be found. + Key key{ kNumRecordsPerThread* thread_id + idx }; + KeyHash hash = key.GetHash(); + std::string error; + error += "key = "; + error += std::to_string(kNumRecordsPerThread* thread_id + idx); + error += ", idx = "; + error += std::to_string(hash.idx(8192)); + error += ", tag = "; + error += std::to_string(hash.tag()); + ASSERT_TRUE(found_all) << error; + found_all = false; + } + } + }; + + std::deque threads{}; + for(uint32_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(read_worker, &new_store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + ASSERT_GT(records_read, (uint32_t)0); + ASSERT_LE(records_read, kNumRecords); +} + +TEST(CLASS, Concurrent_Rmw_Large) { + class RmwContext; + + class Key { + public: + Key(uint32_t key) + : key_{ key } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Key)); + } + inline KeyHash GetHash() const { + std::hash hash_fn{}; + return KeyHash{ hash_fn(key_) }; + } + + /// Comparison operators. + inline bool operator==(const Key& other) const { + return key_ == other.key_; + } + inline bool operator!=(const Key& other) const { + return key_ != other.key_; + } + + friend class RmwContext; + + private: + uint32_t key_; + }; + static_assert(sizeof(Key) == 4, "sizeof(Key) != 4"); + static_assert(alignof(Key) == 4, "alignof(Key) != 4"); + + class ReadContext1; + class ReadContext2; + + class Value { + public: + Value() + : val_{ 0 } { + } + + inline static constexpr uint32_t size() { + return static_cast(sizeof(Value)); + } + + friend class RmwContext; + friend class ReadContext1; + friend class ReadContext2; + + private: + union { + std::atomic atomic_val_; + uint32_t val_; + }; + }; + static_assert(sizeof(Value) == 4, "sizeof(Value) != 4"); + static_assert(alignof(Value) == 4, "alignof(Value) != 4"); + + class RmwContext : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + RmwContext(const Key& key, uint32_t delta) + : key_{ key } + , delta_{ delta } { + } + + /// Copy (and deep-copy) constructor. + RmwContext(const RmwContext& other) + : key_{ other.key_ } + , delta_{ other.delta_ } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + inline static constexpr uint32_t value_size() { + return sizeof(value_t); + } + /// Non-atomic and atomic Put() methods. + inline void RmwInitial(Value& value) { + value.val_ = key_.key_; + } + inline void RmwCopy(const value_t& old_value, value_t& value) { + value.val_ = old_value.val_ + delta_; + } + inline bool RmwAtomic(value_t& value) { + value.atomic_val_ += delta_; + return true; + } + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. 
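+  /// Invoked only when an operation goes async (Status::Pending); the stack-allocated context is
+  /// deep-copied to the heap so the request can complete later.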
+ Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t delta_; + }; + + std::experimental::filesystem::create_directories("storage"); + + static constexpr uint32_t kNumRecords = 6000000; + static constexpr uint32_t kNumThreads = 16; + static_assert(kNumRecords % kNumThreads == 0, "kNumRecords % kNumThreads != 0"); + static constexpr uint32_t kNumRecordsPerThread = kNumRecords / kNumThreads; + + static Guid session_ids[kNumThreads]; + std::memset(session_ids, 0, sizeof(session_ids)); + + static std::atomic num_threads_persistent; + num_threads_persistent = 0; + static std::atomic threads_persistent[Thread::kMaxNumThreads]; + for(size_t idx = 0; idx < Thread::kMaxNumThreads; ++idx) { + threads_persistent[idx] = false; + } + + static std::atomic num_threads_started; + num_threads_started = 0; + + static auto persistence_callback = [](uint64_t persistent_serial_num) { + bool expected = false; + ASSERT_TRUE(threads_persistent[Thread::id()].compare_exchange_strong(expected, true)); + ++num_threads_persistent; + }; + + typedef FasterKv store_t; + + class ReadContext1 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext1(Key key, uint32_t expected_) + : key_{ key } + , val_{ 0 } + , expected{ expected_ } { + } + + /// Copy (and deep-copy) constructor. + ReadContext1(const ReadContext1& other) + : key_{ other.key_ } + , val_{ other.val_ } + , expected{ other.expected } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + val_ = value.val_; + } + inline void GetAtomic(const Value& value) { + val_ = value.atomic_val_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. + Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + public: + const uint32_t expected; + }; + + { + // 6 pages! + store_t store{ 524288, 402653184, "storage", 0.4 }; + + // Populate the store. 
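+    // Thread 0 additionally calls GrowIndex() after loading its range, doubling the hash table;
+    // the store recovered below is therefore constructed with twice the table size (524288 * 2).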
+ auto populate_worker0 = [](store_t* store, uint32_t thread_id) { + store->StartSession(); + auto callback = [](IAsyncContext* context, Status result) { + ASSERT_EQ(Status::Ok, result); + }; + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + RmwContext context{ Key{ idx }, 230 }; + Status result = store->Rmw(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + if(idx % 256 == 0) { + store->Refresh(); + store->CompletePending(false); + } + } + store->GrowIndex(nullptr); + store->StopSession(); + }; + auto populate_worker = [](store_t* store, uint32_t thread_id) { + store->StartSession(); + auto callback = [](IAsyncContext* context, Status result) { + ASSERT_EQ(Status::Ok, result); + }; + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + RmwContext context{ Key{ idx }, 230 }; + Status result = store->Rmw(context, callback, 1); + ASSERT_EQ(Status::Ok, result); + if(idx % 256 == 0) { + store->Refresh(); + store->CompletePending(false); + } + } + store->StopSession(); + }; + + std::deque threads{}; + threads.emplace_back(populate_worker0, &store, 0); + for(uint32_t idx = 1; idx < kNumThreads; ++idx) { + threads.emplace_back(populate_worker, &store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + /// Read-modify-write and checkpoint the store. + auto rmw_checkpoint_worker = [](store_t* store, uint32_t thread_id) { + assert(thread_id == 0); + session_ids[thread_id] = store->StartSession(); + ++num_threads_started; + + // read-modify-write some records + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + auto callback = [](IAsyncContext* context, Status result) { + ASSERT_EQ(Status::Ok, result); + }; + RmwContext context{ Key{ idx }, 230 }; + Status result = store->Rmw(context, callback, idx + 1); + ASSERT_TRUE(result == Status::Ok || result == Status::Pending); + if(idx % 256 == 0) { + store->Refresh(); + store->CompletePending(false); + } + } + + while(num_threads_started < kNumThreads) { + std::this_thread::yield(); + } + // checkpoint (transition from REST to INDEX_CHKPT) + ASSERT_TRUE(store->Checkpoint(persistence_callback)); + + // Ensure that the checkpoint completes. + while(num_threads_persistent < kNumThreads) { + store->CompletePending(false); + } + + bool result = store->CompletePending(true); + ASSERT_TRUE(result); + store->StopSession(); + }; + + auto rmw_worker = [](store_t* store, uint32_t thread_id) { + assert(thread_id != 0); + session_ids[thread_id] = store->StartSession(); + ++num_threads_started; + + // update some records + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + auto callback = [](IAsyncContext* context, Status result) { + ASSERT_EQ(Status::Ok, result); + }; + RmwContext context{ Key{ idx }, 230 }; + Status result = store->Rmw(context, callback, idx + 1); + ASSERT_TRUE(result == Status::Ok || result == Status::Pending); + if(idx % 256 == 0) { + store->Refresh(); + store->CompletePending(false); + } + } + + // Don't exit this session until the checkpoint has completed. 
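+      // Keep pumping CompletePending()/Refresh() so this session's epoch can advance; the
+      // checkpoint cannot finish until every active session has acknowledged it.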
+ while(num_threads_persistent < kNumThreads) { + store->CompletePending(false); + } + + bool result = store->CompletePending(true); + ASSERT_TRUE(result); + store->StopSession(); + }; + + threads.clear(); + threads.emplace_back(rmw_checkpoint_worker, &store, 0); + for(uint32_t idx = 1; idx < kNumThreads; ++idx) { + threads.emplace_back(rmw_worker, &store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + // Verify the store. + store.StartSession(); + for(uint32_t idx = 0; idx < kNumRecords; ++idx) { + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + ASSERT_EQ(context->expected, context->val()); + }; + + ReadContext1 context{ Key{ idx }, idx + 230 }; + Status result = store.Read(context, callback, 1); + if(result != Status::Ok) { + ASSERT_EQ(Status::Pending, result); + } + } + store.StopSession(); + } + + // Test recovery. + store_t new_store{ 524288 * 2, 402653184, "storage", 0.4 }; + + std::vector recovered_session_ids; + Status status = new_store.Recover(1, 1, recovered_session_ids); + ASSERT_EQ(recovered_session_ids.size(), kNumThreads); + ASSERT_EQ(Status::Ok, status); + + static std::atomic records_read; + records_read = 0; + + class ReadContext2 : public IAsyncContext { + public: + typedef Key key_t; + typedef Value value_t; + + ReadContext2(Key key, uint32_t expected_, uint32_t idx_, std::atomic* found_) + : key_{ key } + , val_{ 0 } + , expected{ expected_ } + , idx{ idx_ } + , found{ found_ } { + } + + /// Copy (and deep-copy) constructor. + ReadContext2(const ReadContext2& other) + : key_{ other.key_ } + , val_{ other.val_ } + , expected{ other.expected } + , idx{ other.idx } + , found{ other.found } { + } + + /// The implicit and explicit interfaces require a key() accessor. + inline const Key& key() const { + return key_; + } + + inline void Get(const Value& value) { + val_ = value.val_; + } + inline void GetAtomic(const Value& value) { + val_ = value.atomic_val_.load(); + } + + uint64_t val() const { + return val_; + } + + protected: + /// The explicit interface requires a DeepCopy_Internal() implementation. 
+ Status DeepCopy_Internal(IAsyncContext*& context_copy) { + return IAsyncContext::DeepCopy_Internal(*this, context_copy); + } + + private: + Key key_; + uint32_t val_; + public: + const uint32_t expected; + const uint32_t idx; + std::atomic* found; + }; + + auto read_worker = [](store_t* store, uint32_t thread_id) { + uint64_t serial_num = store->ContinueSession(session_ids[thread_id]); + ASSERT_GE(serial_num, 1); + + std::unique_ptr> found{ new std::atomic[kNumRecordsPerThread] }; + std::memset(found.get(), 0, sizeof(found.get()[0]) * kNumRecordsPerThread); + + // verify records + auto callback = [](IAsyncContext* ctxt, Status result) { + CallbackContext context{ ctxt }; + ASSERT_EQ(Status::Ok, result); + if(context->expected == context->val()) { + bool expected = false; + ASSERT_TRUE(context->found[context->idx].compare_exchange_strong(expected, true)); + } else { + ASSERT_EQ(context->expected - 230, context->val()); + bool expected = false; + ASSERT_FALSE(context->found[context->idx].load()); + } + }; + for(uint32_t idx = kNumRecordsPerThread * thread_id; + idx < kNumRecordsPerThread * (thread_id + 1); ++idx) { + ReadContext2 context{ Key{ idx }, idx + 230, idx - (kNumRecordsPerThread * thread_id), + found.get() }; + Status result = store->Read(context, callback, 1); + if(result == Status::Ok) { + ++records_read; + if(context.expected == context.val()) { + bool expected = false; + ASSERT_TRUE(found.get()[context.idx].compare_exchange_strong(expected, true)); + } else { + ASSERT_EQ(idx, context.val()); + bool expected = false; + ASSERT_FALSE(found.get()[context.idx].load()); + } + } else { + ASSERT_EQ(Status::Pending, result); + } + if(idx % 256 == 0) { + store->Refresh(); + store->CompletePending(false); + } + } + store->CompletePending(true); + store->StopSession(); + + bool found_all = true; + for(uint32_t idx = 0; idx < kNumRecordsPerThread; ++idx) { + if(found_all != found.get()[idx]) { + // Consistent-point recovery implies that after one record isn't found, all subsequent + // records will not be found. + Key key{ kNumRecordsPerThread* thread_id + idx }; + KeyHash hash = key.GetHash(); + std::string error; + error += "key = "; + error += std::to_string(kNumRecordsPerThread* thread_id + idx); + error += ", idx = "; + error += std::to_string(hash.idx(8192)); + error += ", tag = "; + error += std::to_string(hash.tag()); + ASSERT_TRUE(found_all) << error; + found_all = false; + } + } + }; + + std::deque threads{}; + for(uint32_t idx = 0; idx < kNumThreads; ++idx) { + threads.emplace_back(read_worker, &new_store, idx); + } + for(auto& thread : threads) { + thread.join(); + } + + ASSERT_GT(records_read, (uint32_t)0); + ASSERT_LE(records_read, kNumRecords); +} diff --git a/cc/test/recovery_threadpool_test.cc b/cc/test/recovery_threadpool_test.cc new file mode 100644 index 000000000..cbe9728f7 --- /dev/null +++ b/cc/test/recovery_threadpool_test.cc @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
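+
+// Driver that instantiates the shared recovery tests from recovery_test.h using the thread-pool
+// I/O handler: CLASS names the gtest test case, and including recovery_test.h stamps out the
+// tests for this handler. Another driver (e.g. recovery_queue_test.cc in this commit) can reuse
+// the same header by defining a different handler_t and CLASS, roughly as follows:
+//
+//   typedef FASTER::environment::QueueIoHandler handler_t;  // assumed handler name
+//   #define CLASS RecoveryTest_Queue                        // hypothetical test-case name
+//   #include "recovery_test.h"
+//   #undef CLASS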
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "core/faster.h" +#include "core/light_epoch.h" +#include "core/thread.h" +#include "device/file_system_disk.h" + +using namespace FASTER::core; + +typedef FASTER::environment::ThreadPoolIoHandler handler_t; + +#define CLASS RecoveryTest_ThreadPool + +#include "recovery_test.h" + +#undef CLASS + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/cc/test/utility_test.cc b/cc/test/utility_test.cc new file mode 100644 index 000000000..4c21007d3 --- /dev/null +++ b/cc/test/utility_test.cc @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include +#include "gtest/gtest.h" + +#include "core/auto_ptr.h" + +using namespace FASTER::core; + +TEST(UtilityTest, NextPowerOfTwo) { + EXPECT_EQ(1, next_power_of_two(1)); + EXPECT_EQ(2, next_power_of_two(2)); + EXPECT_EQ(4, next_power_of_two(3)); + EXPECT_EQ(4, next_power_of_two(4)); + EXPECT_EQ(8, next_power_of_two(5)); + EXPECT_EQ(8, next_power_of_two(6)); + EXPECT_EQ(8, next_power_of_two(7)); + EXPECT_EQ(8, next_power_of_two(8)); +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/cs/src/FASTER.sln b/cs/src/FASTER.sln new file mode 100644 index 000000000..94c309df8 --- /dev/null +++ b/cs/src/FASTER.sln @@ -0,0 +1,149 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.27004.2008 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FASTER.benchmark", "benchmark\FASTER.benchmark.csproj", "{33A732D1-2B58-4FEE-9696-B9483496229F}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FASTER.test", "test\FASTER.test.csproj", "{0DC7F5A2-E963-4E7F-BD37-6F7864B726F2}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "native", "native", "{6D05489A-B06F-4946-AF59-887A14D83171}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "readtsc", "native\readtsc\readtsc.vcxproj", "{A6510B80-BD50-4C11-9712-64C3B3865AFF}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "adv-file-ops", "native\adv-file-ops\adv-file-ops.vcxproj", "{5852AC33-6B01-44F5-BAF3-2AAF796E8449}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FASTER.core", "core\FASTER.core.csproj", "{01002755-60CA-40EE-94D9-11C07EB58786}" + ProjectSection(ProjectDependencies) = postProject + {5852AC33-6B01-44F5-BAF3-2AAF796E8449} = {5852AC33-6B01-44F5-BAF3-2AAF796E8449} + {A6510B80-BD50-4C11-9712-64C3B3865AFF} = {A6510B80-BD50-4C11-9712-64C3B3865AFF} + EndProjectSection +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "playground", "playground", "{E6026D6A-01C5-4582-B2C1-64751490DABE}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ManagedSample1", "playground\ManagedSample1\ManagedSample1.csproj", "{17BDD0A5-98E5-464A-8A00-050D9FF4C562}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "core", "core", "{EE591221-F22E-49B3-837C-1921302082DC}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "benchmark", "benchmark", "{CA6AB459-A31A-4C15-B1A6-A82C349B54B4}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{81B3B5D1-70F6-4979-AC76-003F9A6B316B}" +EndProject 
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ManagedSample2", "playground\ManagedSample2\ManagedSample2.csproj", "{7DB87633-9CAB-4AE4-9ED0-AA6E77448486}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ManagedSample3", "playground\ManagedSample3\ManagedSample3.csproj", "{3E571C7C-59B5-485C-AC78-3F34D3511CD2}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SumStore", "playground\SumStore\SumStore.csproj", "{05D61B37-9714-4234-9961-384A63F7175E}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ManagedSample4", "playground\ManagedSample4\ManagedSample4.csproj", "{E1AC9797-ABE3-4881-A51B-37D8687AAE35}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ClassCache", "playground\ClassCache\ClassCache.csproj", "{10FD4868-BB16-442B-B0AC-18AE278D9C60}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NestedTypesTest", "playground\NestedTypesTest\NestedTypesTest.csproj", "{2D5F23F7-3184-43EC-A7F1-C924F7FEF786}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {33A732D1-2B58-4FEE-9696-B9483496229F}.Debug|x64.ActiveCfg = Debug|x64 + {33A732D1-2B58-4FEE-9696-B9483496229F}.Debug|x64.Build.0 = Debug|x64 + {33A732D1-2B58-4FEE-9696-B9483496229F}.Release|x64.ActiveCfg = Release|x64 + {33A732D1-2B58-4FEE-9696-B9483496229F}.Release|x64.Build.0 = Release|x64 + {0DC7F5A2-E963-4E7F-BD37-6F7864B726F2}.Debug|x64.ActiveCfg = Debug|x64 + {0DC7F5A2-E963-4E7F-BD37-6F7864B726F2}.Debug|x64.Build.0 = Debug|x64 + {0DC7F5A2-E963-4E7F-BD37-6F7864B726F2}.Release|x64.ActiveCfg = Release|x64 + {0DC7F5A2-E963-4E7F-BD37-6F7864B726F2}.Release|x64.Build.0 = Release|x64 + {A6510B80-BD50-4C11-9712-64C3B3865AFF}.Debug|x64.ActiveCfg = Release|x64 + {A6510B80-BD50-4C11-9712-64C3B3865AFF}.Debug|x64.Build.0 = Release|x64 + {A6510B80-BD50-4C11-9712-64C3B3865AFF}.Release|x64.ActiveCfg = Release|x64 + {A6510B80-BD50-4C11-9712-64C3B3865AFF}.Release|x64.Build.0 = Release|x64 + {5852AC33-6B01-44F5-BAF3-2AAF796E8449}.Debug|x64.ActiveCfg = Release|x64 + {5852AC33-6B01-44F5-BAF3-2AAF796E8449}.Debug|x64.Build.0 = Release|x64 + {5852AC33-6B01-44F5-BAF3-2AAF796E8449}.Release|x64.ActiveCfg = Release|x64 + {5852AC33-6B01-44F5-BAF3-2AAF796E8449}.Release|x64.Build.0 = Release|x64 + {01002755-60CA-40EE-94D9-11C07EB58786}.Debug|x64.ActiveCfg = Debug|x64 + {01002755-60CA-40EE-94D9-11C07EB58786}.Debug|x64.Build.0 = Debug|x64 + {01002755-60CA-40EE-94D9-11C07EB58786}.Release|x64.ActiveCfg = Release|x64 + {01002755-60CA-40EE-94D9-11C07EB58786}.Release|x64.Build.0 = Release|x64 + {17BDD0A5-98E5-464A-8A00-050D9FF4C562}.Debug|x64.ActiveCfg = Debug|x64 + {17BDD0A5-98E5-464A-8A00-050D9FF4C562}.Debug|x64.Build.0 = Debug|x64 + {17BDD0A5-98E5-464A-8A00-050D9FF4C562}.Release|x64.ActiveCfg = Release|x64 + {17BDD0A5-98E5-464A-8A00-050D9FF4C562}.Release|x64.Build.0 = Release|x64 + {7DB87633-9CAB-4AE4-9ED0-AA6E77448486}.Debug|x64.ActiveCfg = Debug|x64 + {7DB87633-9CAB-4AE4-9ED0-AA6E77448486}.Debug|x64.Build.0 = Debug|x64 + {7DB87633-9CAB-4AE4-9ED0-AA6E77448486}.Release|x64.ActiveCfg = Release|x64 + {7DB87633-9CAB-4AE4-9ED0-AA6E77448486}.Release|x64.Build.0 = Release|x64 + {3E571C7C-59B5-485C-AC78-3F34D3511CD2}.Debug|x64.ActiveCfg = Debug|x64 + {3E571C7C-59B5-485C-AC78-3F34D3511CD2}.Debug|x64.Build.0 = Debug|x64 + {3E571C7C-59B5-485C-AC78-3F34D3511CD2}.Release|x64.ActiveCfg = Release|x64 + 
{3E571C7C-59B5-485C-AC78-3F34D3511CD2}.Release|x64.Build.0 = Release|x64 + {05D61B37-9714-4234-9961-384A63F7175E}.Debug|x64.ActiveCfg = Debug|x64 + {05D61B37-9714-4234-9961-384A63F7175E}.Debug|x64.Build.0 = Debug|x64 + {05D61B37-9714-4234-9961-384A63F7175E}.Release|x64.ActiveCfg = Release|x64 + {05D61B37-9714-4234-9961-384A63F7175E}.Release|x64.Build.0 = Release|x64 + {E1AC9797-ABE3-4881-A51B-37D8687AAE35}.Debug|x64.ActiveCfg = Debug|x64 + {E1AC9797-ABE3-4881-A51B-37D8687AAE35}.Debug|x64.Build.0 = Debug|x64 + {E1AC9797-ABE3-4881-A51B-37D8687AAE35}.Release|x64.ActiveCfg = Release|x64 + {E1AC9797-ABE3-4881-A51B-37D8687AAE35}.Release|x64.Build.0 = Release|x64 + {10FD4868-BB16-442B-B0AC-18AE278D9C60}.Debug|x64.ActiveCfg = Debug|x64 + {10FD4868-BB16-442B-B0AC-18AE278D9C60}.Debug|x64.Build.0 = Debug|x64 + {10FD4868-BB16-442B-B0AC-18AE278D9C60}.Release|x64.ActiveCfg = Release|x64 + {10FD4868-BB16-442B-B0AC-18AE278D9C60}.Release|x64.Build.0 = Release|x64 + {2D5F23F7-3184-43EC-A7F1-C924F7FEF786}.Debug|x64.ActiveCfg = Debug|x64 + {2D5F23F7-3184-43EC-A7F1-C924F7FEF786}.Debug|x64.Build.0 = Debug|x64 + {2D5F23F7-3184-43EC-A7F1-C924F7FEF786}.Release|x64.ActiveCfg = Release|x64 + {2D5F23F7-3184-43EC-A7F1-C924F7FEF786}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {33A732D1-2B58-4FEE-9696-B9483496229F} = {CA6AB459-A31A-4C15-B1A6-A82C349B54B4} + {0DC7F5A2-E963-4E7F-BD37-6F7864B726F2} = {81B3B5D1-70F6-4979-AC76-003F9A6B316B} + {A6510B80-BD50-4C11-9712-64C3B3865AFF} = {6D05489A-B06F-4946-AF59-887A14D83171} + {5852AC33-6B01-44F5-BAF3-2AAF796E8449} = {6D05489A-B06F-4946-AF59-887A14D83171} + {01002755-60CA-40EE-94D9-11C07EB58786} = {EE591221-F22E-49B3-837C-1921302082DC} + {17BDD0A5-98E5-464A-8A00-050D9FF4C562} = {E6026D6A-01C5-4582-B2C1-64751490DABE} + {7DB87633-9CAB-4AE4-9ED0-AA6E77448486} = {E6026D6A-01C5-4582-B2C1-64751490DABE} + {3E571C7C-59B5-485C-AC78-3F34D3511CD2} = {E6026D6A-01C5-4582-B2C1-64751490DABE} + {05D61B37-9714-4234-9961-384A63F7175E} = {E6026D6A-01C5-4582-B2C1-64751490DABE} + {E1AC9797-ABE3-4881-A51B-37D8687AAE35} = {E6026D6A-01C5-4582-B2C1-64751490DABE} + {10FD4868-BB16-442B-B0AC-18AE278D9C60} = {E6026D6A-01C5-4582-B2C1-64751490DABE} + {2D5F23F7-3184-43EC-A7F1-C924F7FEF786} = {E6026D6A-01C5-4582-B2C1-64751490DABE} + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {A0750637-2CCB-4139-B25E-F2CE740DCFAC} + EndGlobalSection + GlobalSection(Performance) = preSolution + HasPerformanceSessions = true + EndGlobalSection + GlobalSection(Performance) = preSolution + HasPerformanceSessions = true + EndGlobalSection + GlobalSection(Performance) = preSolution + HasPerformanceSessions = true + EndGlobalSection + GlobalSection(Performance) = preSolution + HasPerformanceSessions = true + EndGlobalSection + GlobalSection(Performance) = preSolution + HasPerformanceSessions = true + EndGlobalSection + GlobalSection(Performance) = preSolution + HasPerformanceSessions = true + EndGlobalSection + GlobalSection(Performance) = preSolution + HasPerformanceSessions = true + EndGlobalSection + GlobalSection(Performance) = preSolution + HasPerformanceSessions = true + EndGlobalSection + GlobalSection(Performance) = preSolution + HasPerformanceSessions = true + EndGlobalSection + GlobalSection(Performance) = preSolution + HasPerformanceSessions = true + EndGlobalSection +EndGlobal diff --git a/cs/src/benchmark/App.config 
b/cs/src/benchmark/App.config new file mode 100644 index 000000000..306dce5bf --- /dev/null +++ b/cs/src/benchmark/App.config @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/cs/src/benchmark/FASTER.benchmark.csproj b/cs/src/benchmark/FASTER.benchmark.csproj new file mode 100644 index 000000000..c2745a6c0 --- /dev/null +++ b/cs/src/benchmark/FASTER.benchmark.csproj @@ -0,0 +1,43 @@ + + + + net46 + x64 + win7-x64 + + + + Exe + true + FASTER.benchmark + prompt + MinimumRecommendedRules.ruleset + PackageReference + true + + + + TRACE;DEBUG + full + true + bin\x64\Debug\ + + + TRACE + pdbonly + true + bin\x64\Release\ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/cs/src/benchmark/FasterYcsbBenchmark.cs b/cs/src/benchmark/FasterYcsbBenchmark.cs new file mode 100644 index 000000000..444d0fd11 --- /dev/null +++ b/cs/src/benchmark/FasterYcsbBenchmark.cs @@ -0,0 +1,585 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma warning disable 0162 + +//#define DASHBOARD +//#define USE_CODEGEN + +using FASTER.core; +using System; +using System.Diagnostics; +using System.IO; +using System.Net; +using System.Runtime.InteropServices; +using System.Threading; + +namespace FASTER.benchmark +{ + public unsafe class FASTER_YcsbBenchmark + { + public enum Op : ulong + { + Upsert = 0, + Read = 1, + ReadModifyWrite = 2 + } + + const long kInitCount = 250000000; + const long kTxnCount = 1000000000; + const int kFileChunkSize = 4096; + const long kChunkSize = 640; + const bool kUseSyntheticData = false; + + Key[] init_keys_; + + Key[] txn_keys_; + Key* txn_keys_ptr; + + long idx_ = 0; + + Input[] input_; + Input* input_ptr; + readonly IDevice device; + +#if USE_CODEGEN + IFASTER +#else + FasterKV +#endif + store; + + long total_ops_done = 0; + + const string kKeyWorkload = "a"; + readonly int threadCount; + readonly int numaStyle; + readonly string distribution; + readonly int readPercent; + + const int kMaxKey = 268435456; + const int kRunSeconds = 30; + const int kCheckpointSeconds = -1; + + volatile bool done = false; + + public FASTER_YcsbBenchmark(int threadCount_, int numaStyle_, string distribution_, int readPercent_) + { + threadCount = threadCount_; + numaStyle = numaStyle_; + distribution = distribution_; + readPercent = readPercent_; + +#if DASHBOARD + statsWritten = new AutoResetEvent[threadCount]; + for (int i = 0; i < threadCount; i++) + { + statsWritten[i] = new AutoResetEvent(false); + } + threadThroughput = new double[threadCount]; + threadAverageLatency = new double[threadCount]; + threadMaximumLatency = new double[threadCount]; + threadProgress = new long[threadCount]; + writeStats = new bool[threadCount]; + freq = HiResTimer.EstimateCPUFrequency(); +#endif + + device = FASTERFactory.CreateLogDevice("D:\\data\\hlog"); + +#if USE_CODEGEN + store = FASTERFactory.Create +#else + store = new FasterKV +#endif + (kMaxKey / 2, device); + } + + private void SetupYcsb(int thread_idx) + { + if (numaStyle == 0) + Native32.AffinitizeThreadRoundRobin((uint)thread_idx); + else + Native32.AffinitizeThreadShardedTwoNuma((uint)thread_idx); + + store.StartSession(); + +#if DASHBOARD + var tstart = HiResTimer.Rdtsc(); + var tstop1 = tstart; + var lastWrittenValue = 0; + int count = 0; +#endif + + Value value = default(Value); + + for (long chunk_idx = Interlocked.Add(ref idx_, kChunkSize) - kChunkSize; + chunk_idx < kInitCount; + chunk_idx = Interlocked.Add(ref idx_, kChunkSize) - kChunkSize) + 
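+            // Threads claim disjoint chunks of kChunkSize keys by atomically advancing the
+            // shared cursor idx_, so no key is loaded more than once during setup.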
{ + for (long idx = chunk_idx; idx < chunk_idx + kChunkSize; ++idx) + { + if (idx % 256 == 0) + { + store.Refresh(); + + if (idx % 65536 == 0) + { + store.CompletePending(false); + } + } + + Key key = init_keys_[idx]; + store.Upsert(&key, &value, null, 1); + } +#if DASHBOARD + count += (int)kChunkSize; + + //Check if stats collector is requesting for statistics + if (writeStats[thread_idx]) + { + var tstart1 = tstop1; + tstop1 = HiResTimer.Rdtsc(); + threadThroughput[thread_idx] = (count - lastWrittenValue) / ((tstop1 - tstart1) / freq); + lastWrittenValue = count; + writeStats[thread_idx] = false; + statsWritten[thread_idx].Set(); + } +#endif + } + + + store.CompletePending(true); + store.StopSession(); + } + private void RunYcsb(int thread_idx) + { + RandomGenerator rng = new RandomGenerator((uint)(1 + thread_idx)); + + if (numaStyle == 0) + Native32.AffinitizeThreadRoundRobin((uint)thread_idx); + else + Native32.AffinitizeThreadShardedTwoNuma((uint)thread_idx); + + Stopwatch sw = new Stopwatch(); + sw.Start(); + + Value value = default(Value); + long reads_done = 0; + long writes_done = 0; + +#if DASHBOARD + var tstart = HiResTimer.Rdtsc(); + var tstop1 = tstart; + var lastWrittenValue = 0; + int count = 0; +#endif + + store.StartSession(); + + while (!done) + { + long chunk_idx = Interlocked.Add(ref idx_, kChunkSize) - kChunkSize; + while (chunk_idx >= kTxnCount) + { + if (chunk_idx == kTxnCount) + idx_ = 0; + chunk_idx = Interlocked.Add(ref idx_, kChunkSize) - kChunkSize; + } + + var local_txn_keys_ptr = txn_keys_ptr + chunk_idx; + + for (long idx = chunk_idx; idx < chunk_idx + kChunkSize && !done; ++idx, ++local_txn_keys_ptr) + { + Op op; + int r = (int)rng.Generate(100); + if (r < readPercent) + op = Op.Read; + else if (readPercent >= 0) + op = Op.Upsert; + else + op = Op.ReadModifyWrite; + + if (idx % 256 == 0) + { + store.Refresh(); + + if (idx % 65536 == 0) + { + store.CompletePending(false); + } + } + + switch (op) + { + case Op.Upsert: + { + store.Upsert(local_txn_keys_ptr, &value, null, 1); + ++writes_done; + break; + } + case Op.Read: + { + Status result = store.Read(local_txn_keys_ptr, null, (Output*)&value, null, 1); + if (result == Status.OK) + { + ++reads_done; + } + break; + } + case Op.ReadModifyWrite: + { + Status result = store.RMW(local_txn_keys_ptr, input_ptr + (idx & 0x7), null, 1); + if (result == Status.OK) + { + ++writes_done; + } + break; + } + default: + throw new NotImplementedException("Unexpected op: " + op); + } + } + +#if DASHBOARD + count += (int)kChunkSize; + + //Check if stats collector is requesting for statistics + if (writeStats[thread_idx]) + { + var tstart1 = tstop1; + tstop1 = HiResTimer.Rdtsc(); + threadProgress[thread_idx] = count; + threadThroughput[thread_idx] = (count - lastWrittenValue) / ((tstop1 - tstart1) / freq); + lastWrittenValue = count; + writeStats[thread_idx] = false; + statsWritten[thread_idx].Set(); + } +#endif + } + + store.CompletePending(true); + store.StopSession(); + sw.Stop(); + + Console.WriteLine("Thread " + thread_idx + " done; " + reads_done + " reads, " + + writes_done + " writes, in " + sw.ElapsedMilliseconds + " ms."); + Interlocked.Add(ref total_ops_done, reads_done + writes_done); + } + +#if DASHBOARD + int measurementInterval = 2000; + bool allDone; + bool measureLatency; + bool[] writeStats; + private EventWaitHandle[] statsWritten; + double[] threadThroughput; + double[] threadAverageLatency; + double[] threadMaximumLatency; + long[] threadProgress; + double freq; + + void DoContinuousMeasurements() + { + + if 
(numaStyle == 0) + Native32.AffinitizeThreadRoundRobin((uint)threadCount + 1); + else + Native32.AffinitizeThreadShardedTwoNuma((uint)threadCount + 1); + + double totalThroughput, totalLatency, maximumLatency; + double totalProgress; + int ver = 0; + + using (var client = new WebClient()) + { + while (!allDone) + { + ver++; + + Thread.Sleep(measurementInterval); + + totalProgress = 0; + totalThroughput = 0; + totalLatency = 0; + maximumLatency = 0; + + for (int i = 0; i < threadCount; i++) + { + writeStats[i] = true; + } + + + for (int i = 0; i < threadCount; i++) + { + statsWritten[i].WaitOne(); + totalThroughput += threadThroughput[i]; + totalProgress += threadProgress[i]; + if (measureLatency) + { + totalLatency += threadAverageLatency[i]; + if (threadMaximumLatency[i] > maximumLatency) + { + maximumLatency = threadMaximumLatency[i]; + } + } + } + + if (measureLatency) + { + Console.WriteLine("{0} \t {1:0.000} \t {2} \t {3} \t {4} \t {5}", ver, totalThroughput / (double)1000000, totalLatency / threadCount, maximumLatency, store.Size, totalProgress); + } + else + { + Console.WriteLine("{0} \t {1:0.000} \t {2} \t {3}", ver, totalThroughput / (double)1000000, store.Size, totalProgress); + } + } + } + } +#endif + + #region Load Data + + private void LoadDataFromFile(string filePath) + { + string init_filename = filePath + "\\load_" + distribution + "_250M_raw.dat"; + string txn_filename = filePath + "\\run_" + distribution + "_250M_1000M_raw.dat"; + + long count = 0; + using (FileStream stream = File.Open(init_filename, FileMode.Open, FileAccess.Read, + FileShare.Read)) + { + Console.WriteLine("loading keys from " + init_filename + " into memory..."); + init_keys_ = new Key[kInitCount]; + + byte[] chunk = new byte[kFileChunkSize]; + GCHandle chunk_handle = GCHandle.Alloc(chunk, GCHandleType.Pinned); + byte* chunk_ptr = (byte*)chunk_handle.AddrOfPinnedObject(); + + long offset = 0; + + while (true) + { + stream.Position = offset; + int size = stream.Read(chunk, 0, kFileChunkSize); + for (int idx = 0; idx < size; idx += Key.kSizeInBytes) + { + init_keys_[count] = *((Key*)(chunk_ptr + idx)); + ++count; + } + if (size == kFileChunkSize) + offset += kFileChunkSize; + else + break; + + if (count == kInitCount) + break; + } + + if (count != kInitCount) + { + throw new InvalidDataException("Init file load fail!"); + } + } + + Console.WriteLine("loaded " + kInitCount + " keys."); + + + using (FileStream stream = File.Open(txn_filename, FileMode.Open, FileAccess.Read, FileShare.Read)) + { + byte[] chunk = new byte[kFileChunkSize]; + GCHandle chunk_handle = GCHandle.Alloc(chunk, GCHandleType.Pinned); + byte* chunk_ptr = (byte*)chunk_handle.AddrOfPinnedObject(); + + Console.WriteLine("loading txns from " + txn_filename + " into memory..."); + + txn_keys_ = new Key[kTxnCount]; + GCHandle handle2 = GCHandle.Alloc(txn_keys_, GCHandleType.Pinned); + txn_keys_ptr = (Key*)handle2.AddrOfPinnedObject(); + + count = 0; + long offset = 0; + + while (true) + { + stream.Position = offset; + int size = stream.Read(chunk, 0, kFileChunkSize); + for (int idx = 0; idx < size; idx += Key.kSizeInBytes) + { + txn_keys_[count] = *((Key*)(chunk_ptr + idx)); + ++count; + } + if (size == kFileChunkSize) + offset += kFileChunkSize; + else + break; + + if (count == kTxnCount) + break; + } + + if (count != kTxnCount) + { + throw new InvalidDataException("Txn file load fail!" 
+ count + ":" + kTxnCount); + } + } + } + + private void LoadData() + { + if (kUseSyntheticData) + { + LoadSyntheticData(); + return; + } + + string filePath = "C:\\ycsb_files"; + + if (!Directory.Exists(filePath)) + { + filePath = "D:\\ycsb_files"; + } + if (!Directory.Exists(filePath)) + { + filePath = "E:\\ycsb_files"; + } + + if (Directory.Exists(filePath)) + { + LoadDataFromFile(filePath); + } + else + { + Console.WriteLine("WARNING: Could not find YCSB directory, loading synthetic data instead"); + LoadSyntheticData(); + } + } + + private void LoadSyntheticData() + { + init_keys_ = new Key[kInitCount]; + long val = 0; + for (int idx = 0; idx < kInitCount; idx++) + { + init_keys_[idx] = new Key { value = val++ }; + } + + Console.WriteLine("loaded " + kInitCount + " keys."); + + RandomGenerator generator = new RandomGenerator(); + + txn_keys_ = new Key[kTxnCount]; + GCHandle handle2 = GCHandle.Alloc(txn_keys_, GCHandleType.Pinned); + txn_keys_ptr = (Key*)handle2.AddrOfPinnedObject(); + + for (int idx = 0; idx < kTxnCount; idx++) + { + txn_keys_[idx] = new Key { value = (long)generator.Generate64(kInitCount) }; + } + + Console.WriteLine("loaded " + kTxnCount + " txns."); + + } + #endregion + + public unsafe void Run() + { + RandomGenerator rng = new RandomGenerator(); + + LoadData(); + + input_ = new Input[8]; + for (int i = 0; i < 8; i++) + { + input_[i].value = i; + } + GCHandle handle = GCHandle.Alloc(input_, GCHandleType.Pinned); + input_ptr = (Input*)handle.AddrOfPinnedObject(); + + Console.WriteLine("loaded " + kTxnCount + " txns."); + +#if DASHBOARD + var dash = new Thread(() => DoContinuousMeasurements()); + dash.Start(); +#endif + + Thread[] workers = new Thread[threadCount]; + + Console.WriteLine("Executing setup."); + + // Setup the store for the YCSB benchmark. + for (int idx = 0; idx < threadCount; ++idx) + { + int x = idx; + workers[idx] = new Thread(() => SetupYcsb(x)); + } + // Start threads. + foreach (Thread worker in workers) + { + worker.Start(); + } + foreach (Thread worker in workers) + { + worker.Join(); + } + + long startTailAddress = store.Size; + Console.WriteLine("Start tail address = " + startTailAddress); + + + idx_ = 0; + store.DumpDistribution(); + + Console.WriteLine("Executing experiment."); + + // Run the experiment. + for (int idx = 0; idx < threadCount; ++idx) + { + int x = idx; + workers[idx] = new Thread(() => RunYcsb(x)); + } + // Start threads. 
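+            // The main thread then sleeps for kRunSeconds (optionally taking periodic full
+            // checkpoints), sets 'done' to stop the workers, and reports aggregate throughput.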
+ foreach (Thread worker in workers) + { + worker.Start(); + } + + Stopwatch swatch = new Stopwatch(); + swatch.Start(); + + if (kCheckpointSeconds <= 0) + { + Thread.Sleep(TimeSpan.FromSeconds(kRunSeconds)); + } + else + { + int runSeconds = 0; + while (runSeconds < kRunSeconds) + { + Thread.Sleep(TimeSpan.FromSeconds(kCheckpointSeconds)); + store.TakeFullCheckpoint(out Guid token); + runSeconds += kCheckpointSeconds; + } + } + + swatch.Stop(); + + done = true; + + foreach (Thread worker in workers) + { + worker.Join(); + } + + double seconds = swatch.ElapsedMilliseconds / 1000.0; + long endTailAddress = store.Size; + Console.WriteLine("End tail address = " + endTailAddress); + + Console.WriteLine("Total " + total_ops_done + " ops done " + " in " + seconds + " secs."); + Console.WriteLine("##, " + distribution + ", " + numaStyle + ", " + readPercent + ", " + + threadCount + ", " + total_ops_done / seconds + ", " + + (endTailAddress - startTailAddress)); + + Console.ReadLine(); + } + } +} diff --git a/cs/src/benchmark/Program.cs b/cs/src/benchmark/Program.cs new file mode 100644 index 000000000..37279570f --- /dev/null +++ b/cs/src/benchmark/Program.cs @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using FASTER.core; +using CommandLine; + +namespace FASTER.benchmark +{ + class Options + { + [Option('b', "benchmark", Required = false, Default = 0, + HelpText = "Benchmark to run (0 - YCSB)")] + public int Benchmark { get; set; } + + [Option('t', "threads", Required = false, Default = 8, + HelpText = "Number of threads to run the workload.")] + public int ThreadCount { get; set; } + + [Option('n', "numa", Required = false, Default = 0, + HelpText = "0 = no numa, 1 = sharded numa")] + public int NumaStyle { get; set; } + + [Option('r', "read_percent", Required = false, Default = 50, + HelpText = "Percentage of reads (-1 for 100% RMW")] + public int ReadPercent { get; set; } + + [Option('d', "distribution", Required = false, Default = "uniform", + HelpText = "Distribution")] + public string Distribution { get; set; } + } + + enum BenchmarkType : int + { + Ycsb + }; + + public class Program + { + public static void Main(string[] args) + { + ParserResult result = Parser.Default.ParseArguments(args); + if (result.Tag == ParserResultType.NotParsed) + { + return; + } + + var options = result.MapResult(o => o, xs => new Options()); + BenchmarkType b = (BenchmarkType)options.Benchmark; + + if (b == BenchmarkType.Ycsb) + { + var test = new FASTER_YcsbBenchmark(options.ThreadCount, options.NumaStyle, options.Distribution, options.ReadPercent); + test.Run(); + } + } + } +} diff --git a/cs/src/benchmark/Properties/AssemblyInfo.cs b/cs/src/benchmark/Properties/AssemblyInfo.cs new file mode 100644 index 000000000..62e3fd7b9 --- /dev/null +++ b/cs/src/benchmark/Properties/AssemblyInfo.cs @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. 
+//[assembly: AssemblyTitle("FASTER.benchmark")] +[assembly: AssemblyDescription("")] +//[assembly: AssemblyConfiguration("")] +//[assembly: AssemblyCompany("")] +//[assembly: AssemblyProduct("FASTER.benchmark")] +[assembly: AssemblyCopyright("Copyright © 2015")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. +[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("19ed104b-3dcc-42ed-9dc0-afa825042543")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Build and Revision Numbers +// by using the '*' as shown below: +// [assembly: AssemblyVersion("1.0.*")] +//[assembly: AssemblyVersion("1.0.0.0")] +//[assembly: AssemblyFileVersion("1.0.0.0")] diff --git a/cs/src/benchmark/RandomGenerator.cs b/cs/src/benchmark/RandomGenerator.cs new file mode 100644 index 000000000..8dbc55637 --- /dev/null +++ b/cs/src/benchmark/RandomGenerator.cs @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Text; +using System.Threading.Tasks; + +namespace FASTER.core +{ + public class RandomGenerator + { + private uint x; + private uint y; + private uint z; + private uint w; + + public RandomGenerator(uint seed = 0) + { + if (seed == 0) + { + long counter = 0; + HiResTimer.QueryPerformanceCounter(ref counter); + x = (uint)(counter & 0x0FFFFFFF); + } + else + { + x = seed; + } + + y = 362436069; + z = 521288629; + w = 88675123; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint Generate() + { + uint t; + t = (x ^ (x << 11)); + x = y; + y = z; + z = w; + + return (w = (w ^ (w >> 19)) ^ (t ^ (t >> 8))); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint Generate(uint max) + { + uint t; + t = (x ^ (x << 11)); + x = y; + y = z; + z = w; + + return (w = (w ^ (w >> 19)) ^ (t ^ (t >> 8))) % max; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ulong Generate64(ulong max) + { + uint t; + t = (x ^ (x << 11)); + x = y; + y = z; + z = w; + + ulong r = (w = (w ^ (w >> 19)) ^ (t ^ (t >> 8))); + + r <<= 32; + + t = (x ^ (x << 11)); + x = y; + y = z; + z = w; + + r |= ((w = (w ^ (w >> 19)) ^ (t ^ (t >> 8)))); + + return r % max; + } + } +} diff --git a/cs/src/core/Allocator/IAllocator.cs b/cs/src/core/Allocator/IAllocator.cs new file mode 100644 index 000000000..2e04cc109 --- /dev/null +++ b/cs/src/core/Allocator/IAllocator.cs @@ -0,0 +1,14 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
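+
+// Minimal allocator abstraction: implementations hand out logical addresses which callers
+// translate to raw pointers via GetPhysicalAddress().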
+ +namespace FASTER.core +{ + public interface IAllocator + { + long Allocate(int numSlots); + long GetPhysicalAddress(long logicalAddress); + void CheckForAllocateComplete(ref long address); + int RecordSize { get; } + void Free(); + } +} diff --git a/cs/src/core/Allocator/MallocFixedPageSize.cs b/cs/src/core/Allocator/MallocFixedPageSize.cs new file mode 100644 index 000000000..28b5a1dc5 --- /dev/null +++ b/cs/src/core/Allocator/MallocFixedPageSize.cs @@ -0,0 +1,615 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#define CALLOC + +using System; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Runtime.InteropServices; +using System.Collections.Concurrent; +using System.Linq.Expressions; +using System.IO; + +namespace FASTER.core +{ + public unsafe class MallocFixedPageSize + { + public static bool ForceUnpinnedAllocation = false; + + public static MallocFixedPageSize Instance = new MallocFixedPageSize(); + public static MallocFixedPageSize PhysicalInstance = new MallocFixedPageSize(true); + + protected const int PageSizeBits = 16; + internal const int PageSize = 1 << PageSizeBits; + protected const int PageSizeMask = PageSize - 1; + protected const int LevelSizeBits = 18; + protected const int LevelSize = 1 << LevelSizeBits; + protected const int LevelSizeMask = LevelSize - 1; + + protected T[][] values = new T[LevelSize][]; + protected GCHandle[] handles = new GCHandle[LevelSize]; + protected IntPtr[] pointers = new IntPtr[LevelSize]; + + protected T[] values0; + protected GCHandle handles0; + protected IntPtr pointers0; + protected readonly int RecordSize; + protected readonly int AlignedPageSize; + + protected volatile int writeCacheLevel; + + protected volatile int count; + + public readonly bool IsPinned; + public readonly bool ReturnPhysicalAddress; + + [ThreadStatic] + public static Queue freeList; +#if DEBUG + public ConcurrentBag> allQueues = new ConcurrentBag>(); +#endif + public MallocFixedPageSize(bool returnPhysicalAddress = false) + { + values[0] = new T[PageSize]; + +#if !(CALLOC) + Array.Clear(values[0], 0, PageSize); +#endif + ReturnPhysicalAddress = returnPhysicalAddress; + + if (ForceUnpinnedAllocation) + { + IsPinned = false; + ReturnPhysicalAddress = false; + } + else + { + IsPinned = true; + if (default(T) == null) + { + IsPinned = false; + ReturnPhysicalAddress = false; + } + else + { + try + { + handles[0] = GCHandle.Alloc(values[0], GCHandleType.Pinned); + pointers[0] = handles[0].AddrOfPinnedObject(); + handles0 = handles[0]; + pointers0 = pointers[0]; + RecordSize = Marshal.SizeOf(values[0][0]); + AlignedPageSize = RecordSize * PageSize; + } + catch (Exception) + { + IsPinned = false; + ReturnPhysicalAddress = false; + } + } + } + + values0 = values[0]; + writeCacheLevel = -1; + Interlocked.MemoryBarrier(); + + BulkAllocate(); // null pointer + } + + public void ReInitialize() + { + values = new T[LevelSize][]; + handles = new GCHandle[LevelSize]; + pointers = new IntPtr[LevelSize]; + values[0] = new T[PageSize]; + + +#if !(CALLOC) + Array.Clear(values[0], 0, PageSize); +#endif + + if (IsPinned) + { + handles[0] = GCHandle.Alloc(values[0], GCHandleType.Pinned); + pointers[0] = handles[0].AddrOfPinnedObject(); + handles0 = handles[0]; + pointers0 = pointers[0]; + } + + values0 = values[0]; + writeCacheLevel = -1; + Interlocked.MemoryBarrier(); + + BulkAllocate(); // null pointer + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + 
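+        // If the allocator returns physical addresses, 'address' is already a raw pointer and is
+        // returned unchanged; otherwise the logical address is split into a page index (high bits)
+        // and a slot within that pinned page (low bits).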
public long GetPhysicalAddress(long address) + { + if (ReturnPhysicalAddress) + { + return address; + } + else + { + return + (long)pointers[address >> PageSizeBits] + + (long)(address & PageSizeMask) * RecordSize; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ref T Get(long index) + { + if (this.ReturnPhysicalAddress) + throw new Exception("Physical pointer returned by allocator: de-reference pointer to get records instead of calling Get"); + + return ref values + [index >> PageSizeBits] + [index & PageSizeMask]; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Set(long index, ref T value) + { + if (this.ReturnPhysicalAddress) + throw new Exception("Physical pointer returned by allocator: de-reference pointer to set records instead of calling Set (otherwise, set ForceUnpinnedAllocation to true)"); + + values + [index >> PageSizeBits] + [index & PageSizeMask] + = value; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Set(long index, T value) + { + Set(index, ref value); + } + + //static long _freed = 0; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void FreeAtEpoch(long pointer, int removed_epoch = -1) + { + //if (Interlocked.Increment(ref _freed) % 100000 == 0) + //{ + // Console.WriteLine("Freed " + _freed); + //} + + if (!ReturnPhysicalAddress) + { + values[pointer >> PageSizeBits][pointer & PageSizeMask] = default(T); + } + if (freeList == null) freeList = new Queue(); + freeList.Enqueue(new FreeItem { removed_item = pointer, removal_epoch = removed_epoch }); + } + +#if DEBUG + public int TotalFreeCount() + { + int result = 0; + var x = allQueues.ToArray(); + foreach (var q in x) + { + result += q.Count; + } + return result; + } + + public int TotalUsedPointers() + { + return count - TotalFreeCount(); + } +#endif + public const int kAllocateChunkSize = 16; + + + /// + /// Warning: cannot mix 'n' match use of + /// Allocate and BulkAllocate + /// + /// + public long BulkAllocate() + { + // Determine insertion index. + // ReSharper disable once CSharpWarnings::CS0420 +#pragma warning disable 420 + int index = Interlocked.Add(ref count, kAllocateChunkSize) - kAllocateChunkSize; +#pragma warning restore 420 + + int offset = index & PageSizeMask; + int baseAddr = index >> PageSizeBits; + + // Handle indexes in first batch specially because they do not use write cache. + if (baseAddr == 0) + { + // If index 0, then allocate space for next level. + if (index == 0) + { + var tmp = new T[PageSize]; +#if !(CALLOC) + Array.Clear(tmp, 0, PageSize); +#endif + + if (IsPinned) + { + handles[1] = GCHandle.Alloc(tmp, GCHandleType.Pinned); + pointers[1] = handles[1].AddrOfPinnedObject(); + } + values[1] = tmp; + Interlocked.MemoryBarrier(); + } + + // Return location. + if (ReturnPhysicalAddress) + return (((long)pointers0) + index * RecordSize); + else + return index; + } + + // See if write cache contains corresponding array. + var cache = writeCacheLevel; + T[] array; + + if (cache != -1) + { + // Write cache is correct array only if index is within [arrayCapacity, 2*arrayCapacity). + if (cache == baseAddr) + { + // Return location. + if (ReturnPhysicalAddress) + return ((long)pointers[baseAddr]) + (long)offset * RecordSize; + else + return index; + } + } + + // Write cache did not work, so get level information from index. + // int level = GetLevelFromIndex(index); + + // Spin-wait until level has an allocated array. 
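+            // The array for this level is published by the thread that claimed offset 0
+            // of the previous level (or by the index == 0 path above); if that thread has
+            // not finished yet, values[baseAddr] is still null, so wait for the pointer
+            // to become visible before touching the level.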
+ var spinner = new SpinWait(); + while (true) + { + array = values[baseAddr]; + if (array != null) + { + break; + } + spinner.SpinOnce(); + } + + // Perform extra actions if inserting at offset 0 of level. + if (offset == 0) + { + // Update write cache to point to current level. + writeCacheLevel = baseAddr; + Interlocked.MemoryBarrier(); + + // Allocate for next page + int newBaseAddr = baseAddr + 1; + var tmp = new T[PageSize]; + +#if !(CALLOC) + Array.Clear(tmp, 0, PageSize); +#endif + + if (IsPinned) + { + handles[newBaseAddr] = GCHandle.Alloc(tmp, GCHandleType.Pinned); + pointers[newBaseAddr] = handles[newBaseAddr].AddrOfPinnedObject(); + } + values[newBaseAddr] = tmp; + + Interlocked.MemoryBarrier(); + } + + // Return location. + if (ReturnPhysicalAddress) + return ((long)pointers[baseAddr]) + (long)offset * RecordSize; + else + return index; + } + + //static long _allocated = 0; + public long Allocate() + { + //if (Interlocked.Increment(ref _allocated) % 100000 == 0) + //{ + // Console.WriteLine("Allocated " + _allocated); + //} + + if (freeList == null) + { + freeList = new Queue(); +#if DEBUG + allQueues.Add(freeList); +#endif + } + if (freeList.Count > 0) + { + if (freeList.Peek().removal_epoch <= LightEpoch.Instance.SafeToReclaimEpoch) + return freeList.Dequeue().removed_item; + + //if (freeList.Count % 64 == 0) + // LightEpoch.Instance.BumpCurrentEpoch(); + } + + // Determine insertion index. + // ReSharper disable once CSharpWarnings::CS0420 +#pragma warning disable 420 + int index = Interlocked.Increment(ref count) - 1; +#pragma warning restore 420 + + int offset = index & PageSizeMask; + int baseAddr = index >> PageSizeBits; + + // Handle indexes in first batch specially because they do not use write cache. + if (baseAddr == 0) + { + // If index 0, then allocate space for next level. + if (index == 0) + { + var tmp = new T[PageSize]; + +#if !(CALLOC) + Array.Clear(tmp, 0, PageSize); +#endif + + if (IsPinned) + { + handles[1] = GCHandle.Alloc(tmp, GCHandleType.Pinned); + pointers[1] = handles[1].AddrOfPinnedObject(); + } + values[1] = tmp; + Interlocked.MemoryBarrier(); + } + + // Return location. + if (ReturnPhysicalAddress) + return ((long)pointers0) + index * RecordSize; + else + return index; + } + + // See if write cache contains corresponding array. + var cache = writeCacheLevel; + T[] array; + + if (cache != -1) + { + // Write cache is correct array only if index is within [arrayCapacity, 2*arrayCapacity). + if (cache == baseAddr) + { + // Return location. + if (ReturnPhysicalAddress) + return ((long)pointers[baseAddr]) + (long)offset * RecordSize; + else + return index; + } + } + + // Write cache did not work, so get level information from index. + // int level = GetLevelFromIndex(index); + + // Spin-wait until level has an allocated array. + var spinner = new SpinWait(); + while (true) + { + array = values[baseAddr]; + if (array != null) + { + break; + } + spinner.SpinOnce(); + } + + // Perform extra actions if inserting at offset 0 of level. + if (offset == 0) + { + // Update write cache to point to current level. 
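+                // Interlocked.Increment hands out each index exactly once, so only the
+                // single thread that received offset 0 of this level runs this block: it
+                // publishes the write cache and eagerly allocates the array for the next
+                // level so that later allocations (and the spin-wait above) find it ready.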
+ writeCacheLevel = baseAddr; + Interlocked.MemoryBarrier(); + + // Allocate for next page + int newBaseAddr = baseAddr + 1; + var tmp = new T[PageSize]; + +#if !(CALLOC) + Array.Clear(tmp, 0, PageSize); +#endif + + if (IsPinned) + { + handles[newBaseAddr] = GCHandle.Alloc(tmp, GCHandleType.Pinned); + pointers[newBaseAddr] = handles[newBaseAddr].AddrOfPinnedObject(); + } + values[newBaseAddr] = tmp; + + Interlocked.MemoryBarrier(); + } + + // Return location. + if (ReturnPhysicalAddress) + return ((long)pointers[baseAddr]) + (long)offset * RecordSize; + else + return index; + } + + public void Dispose() + { + for (int i = 0; i < values.Length; i++) + { + if (IsPinned && (handles[i].IsAllocated)) handles[i].Free(); + values[i] = null; + } + handles = null; + pointers = null; + values = null; + values0 = null; + count = 0; + } + + public int GetMaxAllocated() + { + return count; + } + + #region Checkpoint + + // Public facing persistence API + public void TakeCheckpoint(IDevice device, out ulong numBytes) + { + begin_checkpoint(device, 0UL, out numBytes); + } + + public bool IsCheckpointCompleted(bool waitUntilComplete = false) + { + bool completed = checkpointEvent.IsSet; + if (!completed && waitUntilComplete) + { + checkpointEvent.Wait(); + return true; + } + return completed; + } + + // Implementation of an asynchronous checkpointing scheme + protected CountdownEvent checkpointEvent; + + internal void begin_checkpoint(IDevice device, ulong offset, out ulong numBytesWritten) + { + int localCount = count; + int recordsCountInLastLevel = localCount & PageSizeMask; + int numCompleteLevels = localCount >> PageSizeBits; + int numLevels = numCompleteLevels + (recordsCountInLastLevel > 0 ? 1 : 0); + checkpointEvent = new CountdownEvent(numLevels); + + uint alignedPageSize = PageSize * (uint)RecordSize; + uint lastLevelSize = (uint)recordsCountInLastLevel * (uint)RecordSize; + + numBytesWritten = 0; + for (int i = 0; i < numLevels; i++) + { + OverflowPagesFlushAsyncResult result = default(OverflowPagesFlushAsyncResult); + device.WriteAsync(pointers[i], offset + numBytesWritten, alignedPageSize, async_flush_callback, result); + numBytesWritten += (i == numCompleteLevels) ? 
lastLevelSize : alignedPageSize; + } + } + + private void async_flush_callback(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + try + { + if (errorCode != 0) + { + System.Diagnostics.Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + } + catch (Exception ex) + { + System.Diagnostics.Trace.TraceError("Completion Callback error, {0}", ex.Message); + } + finally + { + checkpointEvent.Signal(); + } + } + + public int GetMaxValidAddress() + { + return count; + } + #endregion + + #region Recover + public void Recover(string filename, int buckets, ulong numBytes) + { + Recover(new LocalStorageDevice(filename, false, false, true), buckets, numBytes); + } + + public void Recover(IDevice device, int buckets, ulong numBytes) + { + begin_recovery(device, 0UL, buckets, numBytes, out ulong numBytesRead); + } + + public bool IsRecoveryCompleted(bool waitUntilComplete = false) + { + bool completed = (numLevelsToBeRecovered == 0); + if (!completed && waitUntilComplete) + { + while (numLevelsToBeRecovered != 0) + { + Thread.Sleep(10); + } + } + return completed; + } + + // Implementation of asynchronous recovery + private int numLevelsToBeRecovered; + + internal void begin_recovery(IDevice device, + ulong offset, + int buckets, + ulong numBytesToRead, + out ulong numBytesRead) + { + // Allocate as many records in memory + while (count < buckets) + { + Allocate(); + } + + int numRecords = (int)numBytesToRead / RecordSize; + int recordsCountInLastLevel = numRecords & PageSizeMask; + int numCompleteLevels = numRecords >> PageSizeBits; + int numLevels = numCompleteLevels + (recordsCountInLastLevel > 0 ? 1 : 0); + + numLevelsToBeRecovered = numLevels; + + numBytesRead = 0; + uint alignedPageSize = (uint)PageSize * (uint)RecordSize; + uint lastLevelSize = (uint)recordsCountInLastLevel * (uint)RecordSize; + for (int i = 0; i < numLevels; i++) + { + //read a full page + uint length = (uint)PageSize * (uint)RecordSize; ; + OverflowPagesReadAsyncResult result = default(OverflowPagesReadAsyncResult); + device.ReadAsync(offset + numBytesRead, pointers[i], length, async_page_read_callback, result); + numBytesRead += (i == numCompleteLevels) ? lastLevelSize : alignedPageSize; + } + } + + private void async_page_read_callback( + uint errorCode, + uint numBytes, + NativeOverlapped* overlap) + { + try + { + if (errorCode != 0) + { + System.Diagnostics.Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + } + catch (Exception ex) + { + System.Diagnostics.Trace.TraceError("Completion Callback error, {0}", ex.Message); + } + finally + { + Interlocked.Decrement(ref numLevelsToBeRecovered); + } + } + #endregion + } + + public struct FreeItem + { + public long removed_item; + public int removal_epoch; + } +} diff --git a/cs/src/core/Allocator/PersistentMemoryMalloc.cs b/cs/src/core/Allocator/PersistentMemoryMalloc.cs new file mode 100644 index 000000000..e6fc06234 --- /dev/null +++ b/cs/src/core/Allocator/PersistentMemoryMalloc.cs @@ -0,0 +1,899 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
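+
+// PersistentMemoryMalloc is the hybrid-log allocator: a circular buffer of BufferSize
+// in-memory pages backed by an IDevice, addressed by monotonically growing logical
+// addresses. Rough picture of the address markers maintained below (an illustrative
+// summary of the fields declared in this file, not additional state):
+//
+//   BeginAddress <= HeadAddress <= ReadOnlyAddress <= tail
+//   [disk only)    [in memory, immutable/flushing)   [in memory, mutable)
+//
+// ReadOnlyAddress trails the tail by LogMutableFraction of the buffer, and HeadAddress
+// trails it by HeadOffsetLagSize pages; pages below HeadAddress are dropped from the
+// circular buffer once they have been flushed and closed.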
+ +#define CALLOC +using System; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Runtime.InteropServices; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq.Expressions; +using System.IO; +using System.Diagnostics; + +namespace FASTER.core +{ + + public enum FlushStatus : int { Flushed, InProgress }; + + public enum CloseStatus : int { Closed, Open }; + + + public struct FullPageStatus + { + public long LastFlushedUntilAddress; + public FlushCloseStatus PageFlushCloseStatus; + } + + [StructLayout(LayoutKind.Explicit)] + public struct FlushCloseStatus + { + [FieldOffset(0)] + public FlushStatus PageFlushStatus; + [FieldOffset(4)] + public CloseStatus PageCloseStatus; + [FieldOffset(0)] + public long value; + } + + [StructLayout(LayoutKind.Explicit)] + internal struct PageOffset + { + [FieldOffset(0)] + public int Offset; + [FieldOffset(4)] + public int Page; + [FieldOffset(0)] + public long PageAndOffset; + } + + public unsafe partial class PersistentMemoryMalloc : IAllocator + { + // Epoch information + public LightEpoch epoch; + + // Read buffer pool + NativeSectorAlignedBufferPool readBufferPool; + + // Record size and pinning + private readonly bool IsPinned; + private const int PrivateRecordSize = 1; + private static bool ForceUnpinnedAllocation = false; + + private readonly IDevice device; + private readonly ISegmentedDevice objlogDevice; + private readonly int sectorSize; + + // Page size + private const int LogPageSizeBits = 25; + private const int PageSize = 1 << LogPageSizeBits; + private const int PageSizeMask = PageSize - 1; + private readonly int AlignedPageSizeBytes; + + // Segment size + private const int LogSegmentSizeBits = 30; + private const long SegmentSize = 1 << LogSegmentSizeBits; + private const long SegmentSizeMask = SegmentSize - 1; + private const int SegmentBufferSize = 1 + + (LogTotalSizeBytes / SegmentSize < 1 ? 
1 : (int)(LogTotalSizeBytes / SegmentSize)); + + // Total HLOG size + private const long LogTotalSizeBytes = 1L << 34; // 29 + private const int BufferSize = (int)(LogTotalSizeBytes / (1L << LogPageSizeBits)); + + // HeadOffset lag (from tail) + private const int HeadOffsetLagNumPages = 4; + private const int HeadOffsetLagSize = BufferSize - HeadOffsetLagNumPages; + private const long HeadOffsetLagAddress = (long)HeadOffsetLagSize << LogPageSizeBits; + + // ReadOnlyOffset lag (from tail) + public const double LogMutableFraction = 0.9; + public const long ReadOnlyLagAddress = (long)(LogMutableFraction * BufferSize) << LogPageSizeBits; + + // Circular buffer definition + private T[][] values = new T[BufferSize][]; + private GCHandle[] handles = new GCHandle[BufferSize]; + private IntPtr[] pointers = new IntPtr[BufferSize]; + private GCHandle ptrHandle; + private long* nativePointers; + + // Array that indicates the status of each buffer page + private FullPageStatus[] PageStatusIndicator = new FullPageStatus[BufferSize]; + + NativeSectorAlignedBufferPool ioBufferPool; + + // Index in circular buffer, of the current tail page + private volatile int TailPageIndex; + + // Global address of the current tail (next element to be allocated from the circular buffer) + private PageOffset TailPageOffset; + + public long ReadOnlyAddress; + + public long SafeReadOnlyAddress; + + public long HeadAddress; + + public long SafeHeadAddress; + + public long FlushedUntilAddress; + + public long BeginAddress; + + /// + /// The smallest record size that can be allotted + /// + public int RecordSize + { + get + { + return PrivateRecordSize; + } + } + + public PersistentMemoryMalloc(IDevice device) : this(device, 0) + { + Allocate(Constants.kFirstValidAddress); // null pointer + ReadOnlyAddress = GetTailAddress(); + SafeReadOnlyAddress = ReadOnlyAddress; + HeadAddress = ReadOnlyAddress; + SafeHeadAddress = ReadOnlyAddress; + BeginAddress = ReadOnlyAddress; + } + + public PersistentMemoryMalloc(IDevice device, long startAddress) + { + // Console.WriteLine("Total memory (GB) = " + totalSize/1000000000); + // Console.WriteLine("BufferSize = " + BufferSize); + // Console.WriteLine("ReadOnlyLag = " + (ReadOnlyLagAddress >> PageSizeBits)); + + if (BufferSize < 16) + { + throw new Exception("HLOG buffer must be at least 16 pages"); + } + + this.device = device; + + objlogDevice = CreateObjectLogDevice(device); + + sectorSize = (int)device.GetSectorSize(); + epoch = LightEpoch.Instance; + ioBufferPool = new NativeSectorAlignedBufferPool(1, sectorSize); + + if (ForceUnpinnedAllocation) + { + IsPinned = false; + } + else + { + IsPinned = true; + try + { + var tmp = new T[1]; + var h = GCHandle.Alloc(tmp, GCHandleType.Pinned); + var p = h.AddrOfPinnedObject(); + //PrivateRecordSize = Marshal.SizeOf(tmp[0]); + AlignedPageSizeBytes = (((PrivateRecordSize * PageSize) + (sectorSize - 1)) & ~(sectorSize - 1)); + } + catch (Exception) + { + IsPinned = false; + } + } + + ptrHandle = GCHandle.Alloc(pointers, GCHandleType.Pinned); + nativePointers = (long*)ptrHandle.AddrOfPinnedObject(); + + Initialize(startAddress); + } + + public int GetSectorSize() + { + return sectorSize; + } + + public void Initialize(long startAddress) + { + readBufferPool = new NativeSectorAlignedBufferPool(PrivateRecordSize, sectorSize); + long tailPage = startAddress >> LogPageSizeBits; + int tailPageIndex = (int)(tailPage % BufferSize); + + AllocatePage(tailPageIndex); + + SafeReadOnlyAddress = startAddress; + ReadOnlyAddress = startAddress; + 
SafeHeadAddress = startAddress; + HeadAddress = startAddress; + FlushedUntilAddress = startAddress; + BeginAddress = startAddress; + + TailPageOffset.Page = (int)(startAddress >> LogPageSizeBits); + TailPageOffset.Offset = (int)(startAddress & PageSizeMask); + + TailPageIndex = -1; + + //Handle the case when startAddress + pageSize overflows + //onto the next pageIndex in our buffer pages array + if (0 != (startAddress & PageSizeMask)) + { + // Update write cache to point to current level. + TailPageIndex = tailPageIndex; + Interlocked.MemoryBarrier(); + + // Allocate for next page + int newPageIndex = (tailPageIndex + 1) % BufferSize; + AllocatePage(newPageIndex); + } + } + + /// + /// Dispose memory allocator + /// + public void Free() + { + for (int i = 0; i < values.Length; i++) + { + if (IsPinned && (handles[i].IsAllocated)) handles[i].Free(); + values[i] = null; + PageStatusIndicator[i].PageFlushCloseStatus = new FlushCloseStatus { PageFlushStatus = FlushStatus.Flushed, PageCloseStatus = CloseStatus.Closed }; + } + handles = null; + pointers = null; + values = null; + TailPageOffset.Page = 0; + TailPageOffset.Offset = 0; + SafeReadOnlyAddress = 0; + ReadOnlyAddress = 0; + SafeHeadAddress = 0; + HeadAddress = 0; + BeginAddress = 1; + } + + public long GetTailAddress() + { + var local = TailPageOffset; + return ((long)local.Page << LogPageSizeBits) | (uint)local.Offset; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public T Get(long index) + { + if (this.IsPinned) + throw new Exception("Physical pointer returned by allocator: de-reference pointer to get records instead of calling Get"); + + return values + [index >> LogPageSizeBits] + [index & PageSizeMask]; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Set(long index, ref T value) + { + if (this.IsPinned) + throw new Exception("Physical pointer returned by allocator: de-reference pointer to set records instead of calling Set (otherwise, set ForceUnpinnedAllocation to true)"); + + values + [index >> LogPageSizeBits] + [index & PageSizeMask] + = value; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Set(long index, T value) + { + Set(index, ref value); + } + +#if USEFREELIST + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void FreeAtEpoch(long pointer, int removed_epoch) + { + if (freeList == null) freeList = new Queue(); + freeList.Enqueue(new FreeItem { removed_item = pointer, removal_epoch = removed_epoch }); + } + +#if DEBUG + public long TotalFreeCount() + { + long result = 0; + var x = allQueues.ToArray(); + foreach (var q in x) + { + result += q.Count; + } + return result; + } + + public long TotalUsedPointers() + { + return TailAddress - TotalFreeCount(); + } + +#endif +#endif + //Simple Accessor Functions + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public long GetPage(long logicalAddress) + { + return (logicalAddress >> LogPageSizeBits); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetPageIndexForPage(long page) + { + return (int)(page % BufferSize); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetPageIndexForAddress(long address) + { + return (int)((address >> LogPageSizeBits) % BufferSize); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetCapacityNumPages() + { + return BufferSize; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public long GetStartLogicalAddress(long page) + { + return page << LogPageSizeBits; + } + + 
[MethodImpl(MethodImplOptions.AggressiveInlining)] + public long GetPageSize() + { + return PageSize; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public long GetOffsetInPage(long address) + { + return address & PageSizeMask; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public long GetHeadOffsetLagInPages() + { + return HeadOffsetLagSize; + } + + /// + /// Used to obtain the physical address corresponding to a logical address + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public long GetPhysicalAddress(long logicalAddress) + { + // Offset within page + int offset = (int)(logicalAddress & PageSizeMask); + + // Global page address + long page = (logicalAddress >> LogPageSizeBits); + + // Index of page within the circular buffer + int pageIndex = (int)(page % BufferSize); + + return (*(nativePointers+pageIndex)) + offset*PrivateRecordSize; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public long GetPhysicalAddressInternal(long logicalAddress) + { + // Offset within page + int offset = (int)(logicalAddress & PageSizeMask); + + // Global page address + long page = (logicalAddress >> LogPageSizeBits); + + // Index of page within the circular buffer + int pageIndex = (int)(page % BufferSize); + + return (*(nativePointers + pageIndex)) + offset * PrivateRecordSize; + } + + /// + /// Key function used to allocate memory for a specified number of items + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public long Allocate(int numSlots = 1) + { + PageOffset localTailPageOffset = default(PageOffset); + + // Determine insertion index. + // ReSharper disable once CSharpWarnings::CS0420 +#pragma warning disable 420 + localTailPageOffset.PageAndOffset = Interlocked.Add(ref TailPageOffset.PageAndOffset, numSlots); +#pragma warning restore 420 + + int page = localTailPageOffset.Page; + int offset = localTailPageOffset.Offset - numSlots; + +#region HANDLE PAGE OVERFLOW + /* To prove correctness of the following modifications + * done to TailPageOffset and the allocation itself, + * we should use the fact that only one thread will have any + * of the following cases since it is a counter and we spin-wait + * until the tail is folded onto next page accordingly. + */ + if (localTailPageOffset.Offset >= PageSize) + { + if (offset >= PageSize) + { + //The tail offset value was more than page size before atomic add + //We consider that a failed attempt and retry again + var spin = new SpinWait(); + do + { + //Just to give some more time to the thread + // that is handling this overflow + while (TailPageOffset.Offset >= PageSize) + { + spin.SpinOnce(); + } + + // ReSharper disable once CSharpWarnings::CS0420 +#pragma warning disable 420 + localTailPageOffset.PageAndOffset = Interlocked.Add(ref TailPageOffset.PageAndOffset, numSlots); +#pragma warning restore 420 + + page = localTailPageOffset.Page; + offset = localTailPageOffset.Offset - numSlots; + } while (offset >= PageSize); + } + + + if (localTailPageOffset.Offset == PageSize) + { + //Folding over at page boundary + localTailPageOffset.Page++; + localTailPageOffset.Offset = 0; + TailPageOffset = localTailPageOffset; + } + else if (localTailPageOffset.Offset >= PageSize) + { + //Overflows not allowed. We allot same space in next page. 
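+                    // For instance (hypothetical numbers), with numSlots = 8 and only 3
+                    // slots left on the current page, the 8 slots are re-allotted at
+                    // offset 0 of the next page: the tail becomes (page + 1, numSlots)
+                    // and the 3 slots at the end of the old page are simply skipped.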
+ localTailPageOffset.Page++; + localTailPageOffset.Offset = numSlots; + TailPageOffset = localTailPageOffset; + + page = localTailPageOffset.Page; + offset = 0; + } + } +#endregion + + long address = (((long)page) << LogPageSizeBits) | ((long)offset); + + // Check if TailPageIndex is appropriate and allocated! + int pageIndex = page % BufferSize; + + /* + if (pageIndex == 0 && page != 0) + { + Debugger.Break(); + }*/ + if (TailPageIndex == pageIndex) + { + return (address); + } + + //Invert the address if either the previous page is not flushed or if it is null + if ((PageStatusIndicator[pageIndex].PageFlushCloseStatus.PageFlushStatus != FlushStatus.Flushed) || + (PageStatusIndicator[pageIndex].PageFlushCloseStatus.PageCloseStatus != CloseStatus.Closed) || + (values[pageIndex] == null)) + { + address = -address; + } + + // Update the read-only so that we can get more space for the tail + if (offset == 0) + { + if (address >= 0) + { + TailPageIndex = pageIndex; + Interlocked.MemoryBarrier(); + } + + long newPage = page + 1; + int newPageIndex = (int)((page + 1) % BufferSize); + + long tailAddress = (address < 0 ? -address : address); + PageAlignedShiftReadOnlyAddress(tailAddress); + PageAlignedShiftHeadAddress(tailAddress); + + if (values[newPageIndex] == null) + { + AllocatePage(newPageIndex); + } + } + + return (address); + } + + /// + /// If allocator cannot allocate new memory as the head has not shifted or the previous page + /// is not yet closed, it allocates but returns the negative address. + /// This function is invoked to check if the address previously allocated has become valid to be used + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void CheckForAllocateComplete(ref long address) + { + if (address >= 0) + { + throw new Exception("Address already allocated!"); + } + + PageOffset p = default(PageOffset); + p.Page = (int)((-address) >> LogPageSizeBits); + p.Offset = (int)((-address) & PageSizeMask); + + //Check write cache + int pageIndex = p.Page % BufferSize; + if (TailPageIndex == pageIndex) + { + address = -address; + return; + } + + //Check if we can move the head offset + long currentTailAddress = GetTailAddress(); + PageAlignedShiftHeadAddress(currentTailAddress); + + //Check if I can allocate pageIndex at all + if ((PageStatusIndicator[pageIndex].PageFlushCloseStatus.PageFlushStatus != FlushStatus.Flushed) || + (PageStatusIndicator[pageIndex].PageFlushCloseStatus.PageCloseStatus != CloseStatus.Closed) || + (values[pageIndex] == null)) + { + return; + } + + //correct values and set write cache + address = -address; + if (p.Offset == 0) + { + TailPageIndex = pageIndex; + } + return; + } + + /// + /// Used by applications to make the current state of the database immutable quickly + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void ShiftReadOnlyToTail(out long tailAddress) + { + tailAddress = GetTailAddress(); + long localTailAddress = tailAddress; + long currentReadOnlyOffset = ReadOnlyAddress; + if (MonotonicUpdate(ref ReadOnlyAddress, tailAddress, out long oldReadOnlyOffset)) + { + epoch.BumpCurrentEpoch(() => OnPagesMarkedReadOnly(localTailAddress, false)); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void ShiftBeginAddress(long oldBeginAddress, long newBeginAddress) + { + epoch.BumpCurrentEpoch(() => + { + device.DeleteAddressRange(oldBeginAddress, newBeginAddress); + objlogDevice.DeleteSegmentRange((int)(oldBeginAddress >> LogSegmentSizeBits), (int)(newBeginAddress >> 
LogSegmentSizeBits)); + }); + } + + /// + /// Checks if until address has been flushed! + /// + /// + /// + public bool CheckFlushedUntil(long address) + { + return FlushedUntilAddress >= address; + } + + public void KillFuzzyRegion() + { + while (SafeReadOnlyAddress != ReadOnlyAddress) + { + Interlocked.CompareExchange(ref SafeReadOnlyAddress, + ReadOnlyAddress, + SafeReadOnlyAddress); + } + } + + /// + /// Seal: make sure there are no longer any threads writing to the page + /// Flush: send page to secondary store + /// + /// + /// + public void OnPagesMarkedReadOnly(long newSafeReadOnlyAddress, bool waitForPendingFlushComplete = false) + { + if(MonotonicUpdate(ref SafeReadOnlyAddress, newSafeReadOnlyAddress, out long oldSafeReadOnlyAddress)) + { + Debug.WriteLine("SafeReadOnly shifted from {0:X} to {1:X}", oldSafeReadOnlyAddress, newSafeReadOnlyAddress); + long startPage = oldSafeReadOnlyAddress >> LogPageSizeBits; + + long endPage = (newSafeReadOnlyAddress >> LogPageSizeBits); + int numPages = (int)(endPage - startPage); + if (numPages > 10) + { + new Thread( + () => AsyncFlushPages(startPage, newSafeReadOnlyAddress)).Start(); + } + else + { + AsyncFlushPages(startPage, newSafeReadOnlyAddress); + } + } + } + + /// + /// Action to be performed for when all threads have agreed that a page range is closed. + /// + /// + /// + /// + public void OnPagesClosed(long newSafeHeadAddress, bool replaceWithCleanPage = false) + { + if (MonotonicUpdate(ref SafeHeadAddress, newSafeHeadAddress, out long oldSafeHeadAddress)) + { + Debug.WriteLine("SafeHeadOffset shifted from {0:X} to {1:X}", oldSafeHeadAddress, newSafeHeadAddress); + + for (long closePageAddress = oldSafeHeadAddress; closePageAddress < newSafeHeadAddress; closePageAddress += PageSize) + { + int closePage = (int)((closePageAddress >> LogPageSizeBits) % BufferSize); + + if (replaceWithCleanPage) + { + if (values[closePage] == null) + { + // Allocate a new page + AllocatePage(closePage); + } + else + { + //Clear an old used page + // BUG: we cannot clear because the + // page may not be flushed. 
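+                            // Clearing is instead deferred to ClearPage() below, which
+                            // runs only after the page's flush status has been observed
+                            // as Flushed.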
+ // Array.Clear(values[closePage], 0, values[closePage].Length); + } + } + else + { + values[closePage] = null; + } + + while (true) + { + var oldStatus = PageStatusIndicator[closePage].PageFlushCloseStatus; + if (oldStatus.PageFlushStatus == FlushStatus.Flushed) + { + ClearPage(closePage, (closePageAddress >> LogPageSizeBits) == 0); + + var thisCloseSegment = closePageAddress >> LogSegmentSizeBits; + var nextClosePage = (closePageAddress >> LogPageSizeBits) + 1; + var nextCloseSegment = nextClosePage >> (LogSegmentSizeBits - LogPageSizeBits); + + if (thisCloseSegment != nextCloseSegment) + { + // Last page in current segment + segmentOffsets[thisCloseSegment % SegmentBufferSize] = 0; + } + } + else + { + throw new Exception("Impossible"); + } + var newStatus = oldStatus; + newStatus.PageCloseStatus = CloseStatus.Closed; + if (oldStatus.value == Interlocked.CompareExchange(ref PageStatusIndicator[closePage].PageFlushCloseStatus.value, newStatus.value, oldStatus.value)) + { + break; + } + } + + //Necessary to propagate this change to other threads + Interlocked.MemoryBarrier(); + } + } + } + + private void ClearPage(int page, bool pageZero) + { + if (Key.HasObjectsToSerialize() || Value.HasObjectsToSerialize()) + { + long ptr = (long)pointers[page]; + int numBytes = PageSize * PrivateRecordSize; + long endptr = ptr + numBytes; + + if (pageZero) ptr += Constants.kFirstValidAddress; + + List addr = new List(); + while (ptr < endptr) + { + if (!Layout.GetInfo(ptr)->Invalid) + { + if (Key.HasObjectsToSerialize()) + { + Key* key = Layout.GetKey(ptr); + Key.Free(key); + } + if (Value.HasObjectsToSerialize()) + { + Value* value = Layout.GetValue(ptr); + Value.Free(value); + } + } + ptr += Layout.GetPhysicalSize(ptr); + } + } + Array.Clear(values[page], 0, values[page].Length); + } + + + /// + /// Allocate memory page, pinned in memory, and in sector aligned form, if possible + /// + /// + private void AllocatePage(int index, bool clear = false) + { + if (IsPinned) + { + var adjustedSize = PageSize + (int)Math.Ceiling(2 * sectorSize / PrivateRecordSize * 1.0); + T[] tmp = new T[adjustedSize]; + if (clear) + { + Array.Clear(tmp, 0, adjustedSize); + } + else + { +#if !(CALLOC) + Array.Clear(tmp, 0, adjustedSize); +#endif + } + + handles[index] = GCHandle.Alloc(tmp, GCHandleType.Pinned); + long p = (long)handles[index].AddrOfPinnedObject(); + pointers[index] = (IntPtr)((p + (sectorSize - 1)) & ~(sectorSize - 1)); + values[index] = tmp; + } + else + { + T[] tmp = new T[PageSize]; +#if !(CALLOC) + Array.Clear(tmp, 0, tmp.Length); +#endif + values[index] = tmp; + } + + PageStatusIndicator[index].PageFlushCloseStatus.PageFlushStatus = FlushStatus.Flushed; + PageStatusIndicator[index].PageFlushCloseStatus.PageCloseStatus = CloseStatus.Closed; + Interlocked.MemoryBarrier(); + } + + /// + /// Called every time a new tail page is allocated. Here the read-only is + /// shifted only to page boundaries unlike ShiftReadOnlyToTail where shifting + /// can happen to any fine-grained address. 
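+        /// The target is (currentTailAddress & ~PageSizeMask) - ReadOnlyLagAddress, applied
+        /// via MonotonicUpdate so the marker never moves backwards; the flush work itself is
+        /// deferred to an epoch action that calls OnPagesMarkedReadOnly.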
+ /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void PageAlignedShiftReadOnlyAddress(long currentTailAddress) + { + long currentReadOnlyAddress = ReadOnlyAddress; + long pageAlignedTailAddress = currentTailAddress & ~PageSizeMask; + long desiredReadOnlyAddress = (pageAlignedTailAddress - ReadOnlyLagAddress); + if (MonotonicUpdate(ref ReadOnlyAddress, desiredReadOnlyAddress, out long oldReadOnlyAddress)) + { + if (oldReadOnlyAddress == 0) + Console.WriteLine("Going read-only"); + /* + for (int i = (int)(oldReadOnlyAddress >> LogPageSizeBits); i < (int)(desiredReadOnlyAddress >> LogPageSizeBits); i++) + { + //Set status to in-progress + PageStatusIndicator[i % BufferSize].PageFlushCloseStatus + = new FlushCloseStatus { PageFlushStatus = FlushStatus.InProgress, PageCloseStatus = CloseStatus.Open }; + PageStatusIndicator[i % BufferSize].LastFlushedUntilAddress = -1; + } + */ + epoch.BumpCurrentEpoch(() => OnPagesMarkedReadOnly(desiredReadOnlyAddress)); + } + } + + /// + /// Called whenever a new tail page is allocated or when the user is checking for a failed memory allocation + /// Tries to shift head address based on the head offset lag size. + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void PageAlignedShiftHeadAddress(long currentTailAddress) + { + //obtain local values of variables that can change + long currentHeadAddress = HeadAddress; + long currentFlushedUntilAddress = FlushedUntilAddress; + long pageAlignedTailAddress = currentTailAddress & ~PageSizeMask; + long desiredHeadAddress = (pageAlignedTailAddress - HeadOffsetLagAddress); + + long newHeadAddress = desiredHeadAddress; + if(currentFlushedUntilAddress < newHeadAddress) + { + newHeadAddress = currentFlushedUntilAddress; + } + newHeadAddress = newHeadAddress & ~PageSizeMask; + + if (MonotonicUpdate(ref HeadAddress, newHeadAddress, out long oldHeadAddress)) + { + if (oldHeadAddress == 0) + Console.WriteLine("Going external memory"); + + Debug.WriteLine("Allocate: Moving head offset from {0:X} to {1:X}", oldHeadAddress, newHeadAddress); + epoch.BumpCurrentEpoch(() => OnPagesClosed(newHeadAddress, true)); + } + } + + /// + /// Every async flush callback tries to update the flushed until address to the latest value possible + /// Is there a better way to do this with enabling fine-grained addresses (not necessarily at page boundaries)? + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ShiftFlushedUntilAddress() + { + long currentFlushedUntilAddress = FlushedUntilAddress; + long page = GetPage(currentFlushedUntilAddress); + + bool update = false; + long pageLastFlushedAddress = PageStatusIndicator[(int)(page % BufferSize)].LastFlushedUntilAddress; + while (pageLastFlushedAddress >= currentFlushedUntilAddress) + { + currentFlushedUntilAddress = pageLastFlushedAddress; + update = true; + page++; + pageLastFlushedAddress = PageStatusIndicator[(int)(page % BufferSize)].LastFlushedUntilAddress; + } + + if(update) + { + bool success = MonotonicUpdate(ref FlushedUntilAddress, currentFlushedUntilAddress, out long oldFlushedUntilAddress); + if (success) + { + } + } + } + + + + /// + /// Used by several functions to update the variable to newValue. Ignores if newValue is smaller or + /// than the current value. 
+        /// equal to the current value, i.e., the variable is only ever advanced.
+ /// + /// + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool MonotonicUpdate(ref long variable, long newValue, out long oldValue) + { + oldValue = variable; + while (oldValue < newValue) + { + var foundValue = Interlocked.CompareExchange(ref variable, newValue, oldValue); + if (foundValue == oldValue) + { + return true; + } + oldValue = foundValue; + } + return false; + } + } +} diff --git a/cs/src/core/Codegen/CompilerBase.cs b/cs/src/core/Codegen/CompilerBase.cs new file mode 100644 index 000000000..843ec32d5 --- /dev/null +++ b/cs/src/core/Codegen/CompilerBase.cs @@ -0,0 +1,228 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using Microsoft.CodeAnalysis; +using Microsoft.CodeAnalysis.CSharp; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Text; +using static FASTER.core.Roslyn.Helper; + +namespace FASTER.core.Roslyn +{ + class CompilerBase + { + protected CSharpCompilation compilation; + protected Dictionary metadataReferences = new Dictionary(); + protected IEnumerable