From 60164ff6759e630bb490114bceacd0a11194ce67 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Sat, 23 Mar 2024 12:10:27 +0000 Subject: [PATCH 01/11] pageserver: drop the layer map lock after planning reads --- pageserver/src/tenant/ephemeral_file.rs | 4 + pageserver/src/tenant/layer_map.rs | 31 +--- pageserver/src/tenant/storage_layer.rs | 161 ++++++++---------- .../tenant/storage_layer/inmemory_layer.rs | 10 ++ pageserver/src/tenant/timeline.rs | 46 +++-- 5 files changed, 114 insertions(+), 138 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index e48b9e83bd63..51c11341efd2 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -72,6 +72,10 @@ impl EphemeralFile { self.len } + pub(crate) fn path(&self) -> Utf8PathBuf { + self.file.path.clone() + } + pub(crate) async fn read_blk( &self, blknum: u32, diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index b8ed69052f5e..892f1d9e4bb5 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -576,41 +576,18 @@ impl LayerMap { self.historic.iter() } - /// Get a handle for the first in memory layer that matches the provided predicate. - /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer. - /// - /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during - /// the same exclusive region established by holding the layer manager lock. - pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option + /// Get a ref counted pointer for the first in memory layer that matches the provided predicate. + pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option> where Pred: FnMut(&Arc) -> bool, { if let Some(open) = &self.open_layer { if pred(open) { - return Some(InMemoryLayerHandle::Open { - lsn_floor: open.get_lsn_range().start, - end_lsn: open.get_lsn_range().end, - }); + return Some(open.clone()); } } - let pos = self.frozen_layers.iter().rev().position(pred); - pos.map(|rev_idx| { - let idx = self.frozen_layers.len() - 1 - rev_idx; - InMemoryLayerHandle::Frozen { - idx, - lsn_floor: self.frozen_layers[idx].get_lsn_range().start, - end_lsn: self.frozen_layers[idx].get_lsn_range().end, - } - }) - } - - /// Get the layer pointed to by the provided handle. - pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option> { - match handle { - InMemoryLayerHandle::Open { .. } => self.open_layer.clone(), - InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(), - } + self.frozen_layers.iter().rfind(|l| pred(l)).cloned() } /// diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 5c3bab986888..5f7e9f04e883 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; use std::ops::Range; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tracing::warn; use utils::history_buffer::HistoryBufferWithDropCounter; @@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; -use super::layer_map::InMemoryLayerHandle; -use super::timeline::layer_manager::LayerManager; +use self::inmemory_layer::InMemoryLayerKey; + use super::timeline::GetVectoredError; use super::PageReconstructError; @@ -204,23 +204,48 @@ impl Default for ValuesReconstructState { } } -/// Description of layer to be read - the layer map can turn -/// this description into the actual layer. -#[derive(PartialEq, Eq, Hash, Debug, Clone)] -pub(crate) enum ReadableLayerDesc { - Persistent { - desc: PersistentLayerDesc, - lsn_range: Range, - }, - InMemory { - handle: InMemoryLayerHandle, - lsn_ceil: Lsn, - }, +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +pub(crate) enum LayerKey { + PersitentLayerKey(PersistentLayerKey), + InMemoryLayerKey(InMemoryLayerKey), } -/// Wraper for 'ReadableLayerDesc' sorted by Lsn #[derive(Debug)] -struct ReadableLayerDescOrdered(ReadableLayerDesc); +pub(crate) enum ReadableLayer { + PersistentLayer(Layer), + InMemoryLayer(Arc), +} + +#[derive(Debug, Clone)] +pub(crate) struct ReadDesc { + layer_key: LayerKey, + lsn_range: Range, +} + +impl Ord for ReadDesc { + fn cmp(&self, other: &Self) -> Ordering { + let ord = self.lsn_range.end.cmp(&other.lsn_range.end); + if ord == std::cmp::Ordering::Equal { + self.lsn_range.start.cmp(&other.lsn_range.start).reverse() + } else { + ord + } + } +} + +impl PartialOrd for ReadDesc { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for ReadDesc { + fn eq(&self, other: &Self) -> bool { + self.lsn_range == other.lsn_range + } +} + +impl Eq for ReadDesc {} /// Data structure which maintains a fringe of layers for the /// read path. The fringe is the set of layers which intersects @@ -231,41 +256,50 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc); /// a two layer indexing scheme. #[derive(Debug)] pub(crate) struct LayerFringe { - layers_by_lsn: BinaryHeap, - layers: HashMap, + // TODO: rename members + reads_by_lsn: BinaryHeap, + layers: HashMap, } impl LayerFringe { pub(crate) fn new() -> Self { LayerFringe { - layers_by_lsn: BinaryHeap::new(), + reads_by_lsn: BinaryHeap::new(), layers: HashMap::new(), } } - pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> { - let handle = match self.layers_by_lsn.pop() { - Some(h) => h, + pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range)> { + let read_desc = match self.reads_by_lsn.pop() { + Some(desc) => desc, None => return None, }; - let removed = self.layers.remove_entry(&handle.0); + let removed = self.layers.remove_entry(&read_desc.layer_key); match removed { - Some((layer, keyspace)) => Some((layer, keyspace)), + Some((_, (layer, keyspace))) => Some((layer, keyspace, read_desc.lsn_range)), None => unreachable!("fringe internals are always consistent"), } } - pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) { - let entry = self.layers.entry(layer.clone()); + pub(crate) fn update( + &mut self, + layer: ReadableLayer, + keyspace: KeySpace, + lsn_range: Range, + ) { + let key = layer.key(); + let entry = self.layers.entry(key.clone()); match entry { Entry::Occupied(mut entry) => { - entry.get_mut().merge(&keyspace); + entry.get_mut().1.merge(&keyspace); } Entry::Vacant(entry) => { - self.layers_by_lsn - .push(ReadableLayerDescOrdered(entry.key().clone())); - entry.insert(keyspace); + self.reads_by_lsn.push(ReadDesc { + lsn_range, + layer_key: key.clone(), + }); + entry.insert((layer, keyspace)); } } } @@ -277,77 +311,30 @@ impl Default for LayerFringe { } } -impl Ord for ReadableLayerDescOrdered { - fn cmp(&self, other: &Self) -> Ordering { - let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil()); - if ord == std::cmp::Ordering::Equal { - self.0 - .get_lsn_floor() - .cmp(&other.0.get_lsn_floor()) - .reverse() - } else { - ord - } - } -} - -impl PartialOrd for ReadableLayerDescOrdered { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl PartialEq for ReadableLayerDescOrdered { - fn eq(&self, other: &Self) -> bool { - self.0.get_lsn_floor() == other.0.get_lsn_floor() - && self.0.get_lsn_ceil() == other.0.get_lsn_ceil() - } -} - -impl Eq for ReadableLayerDescOrdered {} - -impl ReadableLayerDesc { - pub(crate) fn get_lsn_floor(&self) -> Lsn { +impl ReadableLayer { + pub(crate) fn key(&self) -> LayerKey { match self { - ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start, - ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(), - } - } - - pub(crate) fn get_lsn_ceil(&self) -> Lsn { - match self { - ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end, - ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil, + Self::PersistentLayer(layer) => LayerKey::PersitentLayerKey(layer.layer_desc().key()), + Self::InMemoryLayer(layer) => LayerKey::InMemoryLayerKey(layer.key()), } } pub(crate) async fn get_values_reconstruct_data( &self, - layer_manager: &LayerManager, keyspace: KeySpace, + lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { match self { - ReadableLayerDesc::Persistent { desc, lsn_range } => { - let layer = layer_manager.get_from_desc(desc); + ReadableLayer::PersistentLayer(layer) => { layer - .get_values_reconstruct_data( - keyspace, - lsn_range.clone(), - reconstruct_state, - ctx, - ) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) .await } - ReadableLayerDesc::InMemory { handle, lsn_ceil } => { - let layer = layer_manager - .layer_map() - .get_in_memory_layer(handle) - .unwrap(); - + ReadableLayer::InMemoryLayer(layer) => { layer - .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx) + .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx) .await } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5f1db21d493b..b03b23f8a2dd 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -32,10 +32,14 @@ use super::{ ValuesReconstructState, }; +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +pub(crate) struct InMemoryLayerKey(camino::Utf8PathBuf); + pub struct InMemoryLayer { conf: &'static PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + key: InMemoryLayerKey, /// This layer contains all the changes from 'start_lsn'. The /// start is inclusive. @@ -79,6 +83,10 @@ impl std::fmt::Debug for InMemoryLayerInner { } impl InMemoryLayer { + pub(crate) fn key(&self) -> InMemoryLayerKey { + self.key.clone() + } + pub(crate) fn get_timeline_id(&self) -> TimelineId { self.timeline_id } @@ -318,8 +326,10 @@ impl InMemoryLayer { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; + let key = InMemoryLayerKey(file.path()); Ok(InMemoryLayer { + key, conf, timeline_id, tenant_shard_id, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7523130f2343..f3eedc93bf22 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -118,11 +118,11 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::remote_timeline_client::RemoteTimelineClient; +use super::config::TenantConf; use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; -use super::{config::TenantConf, storage_layer::ReadableLayerDesc}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; +use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(super) enum FlushLoopState { @@ -2771,16 +2771,6 @@ impl Timeline { let mut completed_keyspace = KeySpace::default(); - // Hold the layer map whilst visiting the timeline to prevent - // compaction, eviction and flushes from rendering the layers unreadable. - // - // TODO: Do we actually need to do this? In theory holding on - // to [`tenant::storage_layer::Layer`] should be enough. However, - // [`Timeline::get`] also holds the lock during IO, so more investigation - // is needed. - let guard = timeline.layers.read().await; - let layers = guard.layer_map(); - loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); @@ -2790,6 +2780,9 @@ impl Timeline { unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); completed_keyspace.merge(&keys_done_last_step); + let guard = timeline.layers.read().await; + let layers = guard.layer_map(); + let in_memory_layer = layers.find_in_memory_layer(|l| { let start_lsn = l.get_lsn_range().start; cont_lsn > start_lsn @@ -2797,12 +2790,11 @@ impl Timeline { match in_memory_layer { Some(l) => { + let lsn_range = l.get_lsn_range().start..cont_lsn; fringe.update( - ReadableLayerDesc::InMemory { - handle: l, - lsn_ceil: cont_lsn, - }, + ReadableLayer::InMemoryLayer(l), unmapped_keyspace.clone(), + lsn_range, ); } None => { @@ -2814,30 +2806,36 @@ impl Timeline { .into_iter() .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { ( - ReadableLayerDesc::Persistent { - desc: (*layer).clone(), - lsn_range: lsn_floor..cont_lsn, - }, + ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, ) }) - .for_each(|(layer, keyspace)| fringe.update(layer, keyspace)); + .for_each(|(layer, keyspace, lsn_range)| { + fringe.update(layer, keyspace, lsn_range) + }); } } } - if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() { + // It's safe to drop the layer map lock after planning the next round of reads. + // The fringe keeps readable handles for the layers which are safe to read even + // if layers were compacted or flushed. + drop(guard); + + if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { + let next_cont_lsn = lsn_range.start; layer_to_read .get_values_reconstruct_data( - &guard, keyspace_to_read.clone(), + lsn_range, reconstruct_state, ctx, ) .await?; unmapped_keyspace = keyspace_to_read; - cont_lsn = layer_to_read.get_lsn_floor(); + cont_lsn = next_cont_lsn; } else { break; } From ba7ffb9750c9abf98277ebf1e985af6619d04242 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Sat, 23 Mar 2024 15:45:06 +0000 Subject: [PATCH 02/11] self-review: comments and tidy-up --- pageserver/src/tenant/storage_layer.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 5f7e9f04e883..8bd613e249a0 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -204,12 +204,16 @@ impl Default for ValuesReconstructState { } } +/// A key that uniquely identifies a layer in a timeline #[derive(Debug, PartialEq, Eq, Clone, Hash)] pub(crate) enum LayerKey { PersitentLayerKey(PersistentLayerKey), InMemoryLayerKey(InMemoryLayerKey), } +/// Layer wrapper for the read path. Note that it is valid +/// to use these layers even after external operations have +/// been performed on them (compaction, freeze, etc.). #[derive(Debug)] pub(crate) enum ReadableLayer { PersistentLayer(Layer), @@ -256,21 +260,20 @@ impl Eq for ReadDesc {} /// a two layer indexing scheme. #[derive(Debug)] pub(crate) struct LayerFringe { - // TODO: rename members - reads_by_lsn: BinaryHeap, + planned_reads_by_lsn: BinaryHeap, layers: HashMap, } impl LayerFringe { pub(crate) fn new() -> Self { LayerFringe { - reads_by_lsn: BinaryHeap::new(), + planned_reads_by_lsn: BinaryHeap::new(), layers: HashMap::new(), } } pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range)> { - let read_desc = match self.reads_by_lsn.pop() { + let read_desc = match self.planned_reads_by_lsn.pop() { Some(desc) => desc, None => return None, }; @@ -295,7 +298,7 @@ impl LayerFringe { entry.get_mut().1.merge(&keyspace); } Entry::Vacant(entry) => { - self.reads_by_lsn.push(ReadDesc { + self.planned_reads_by_lsn.push(ReadDesc { lsn_range, layer_key: key.clone(), }); From f7825b4ced9cb1e7af6c7aabc1b03e53ae9e9aaf Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Sat, 23 Mar 2024 16:05:13 +0000 Subject: [PATCH 03/11] pageserver: drop layer map lock in Timeline::get --- pageserver/src/tenant/timeline.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f3eedc93bf22..2834ab097cb3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2596,6 +2596,10 @@ impl Timeline { // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. let lsn_floor = max(cached_lsn + 1, start_lsn); + + let open_layer = open_layer.clone(); + drop(guard); + result = match open_layer .get_value_reconstruct_data( key, @@ -2613,10 +2617,7 @@ impl Timeline { traversal_path.push(( result, cont_lsn, - Box::new({ - let open_layer = Arc::clone(open_layer); - move || open_layer.traversal_id() - }), + Box::new(move || open_layer.traversal_id()), )); continue 'outer; } @@ -2626,6 +2627,10 @@ impl Timeline { if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); let lsn_floor = max(cached_lsn + 1, start_lsn); + + let frozen_layer = frozen_layer.clone(); + drop(guard); + result = match frozen_layer .get_value_reconstruct_data( key, @@ -2643,10 +2648,7 @@ impl Timeline { traversal_path.push(( result, cont_lsn, - Box::new({ - let frozen_layer = Arc::clone(frozen_layer); - move || frozen_layer.traversal_id() - }), + Box::new(move || frozen_layer.traversal_id()), )); continue 'outer; } @@ -2654,6 +2656,8 @@ impl Timeline { if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { let layer = guard.get_from_desc(&layer); + drop(guard); + // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. let lsn_floor = max(cached_lsn + 1, lsn_floor); From 6290e5a75cba0da3666dd660596c1d2c97e508d8 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 26 Mar 2024 17:26:34 +0000 Subject: [PATCH 04/11] review: keep some utils private --- pageserver/src/tenant/storage_layer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 8bd613e249a0..5c25f9987657 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -221,7 +221,7 @@ pub(crate) enum ReadableLayer { } #[derive(Debug, Clone)] -pub(crate) struct ReadDesc { +struct ReadDesc { layer_key: LayerKey, lsn_range: Range, } From 23745c232e1fc4102a0360a2a766fbecbd5fe890 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 26 Mar 2024 17:36:14 +0000 Subject: [PATCH 05/11] review: use page cache file id instead of path buf --- pageserver/src/tenant/ephemeral_file.rs | 4 ++-- pageserver/src/tenant/storage_layer/inmemory_layer.rs | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 51c11341efd2..b27230db0340 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -72,8 +72,8 @@ impl EphemeralFile { self.len } - pub(crate) fn path(&self) -> Utf8PathBuf { - self.file.path.clone() + pub(crate) fn id(&self) -> page_cache::FileId { + self.page_cache_file_id } pub(crate) async fn read_blk( diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index b03b23f8a2dd..4ebc6b71d2a1 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -12,7 +12,7 @@ use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{PageReconstructError, Timeline}; -use crate::walrecord; +use crate::{page_cache, walrecord}; use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; @@ -32,8 +32,8 @@ use super::{ ValuesReconstructState, }; -#[derive(Debug, PartialEq, Eq, Clone, Hash)] -pub(crate) struct InMemoryLayerKey(camino::Utf8PathBuf); +#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] +pub(crate) struct InMemoryLayerKey(page_cache::FileId); pub struct InMemoryLayer { conf: &'static PageServerConf, @@ -84,7 +84,7 @@ impl std::fmt::Debug for InMemoryLayerInner { impl InMemoryLayer { pub(crate) fn key(&self) -> InMemoryLayerKey { - self.key.clone() + self.key } pub(crate) fn get_timeline_id(&self) -> TimelineId { @@ -326,7 +326,7 @@ impl InMemoryLayer { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; - let key = InMemoryLayerKey(file.path()); + let key = InMemoryLayerKey(file.id()); Ok(InMemoryLayer { key, From ab8ce836f377aa42e01507e55c6b5509d16ed6bc Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 27 Mar 2024 09:27:23 +0000 Subject: [PATCH 06/11] review: move Ord impl for ReadDesc --- pageserver/src/tenant/storage_layer.rs | 50 +++++++++++++------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 5c25f9987657..21be25a87741 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -226,31 +226,6 @@ struct ReadDesc { lsn_range: Range, } -impl Ord for ReadDesc { - fn cmp(&self, other: &Self) -> Ordering { - let ord = self.lsn_range.end.cmp(&other.lsn_range.end); - if ord == std::cmp::Ordering::Equal { - self.lsn_range.start.cmp(&other.lsn_range.start).reverse() - } else { - ord - } - } -} - -impl PartialOrd for ReadDesc { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl PartialEq for ReadDesc { - fn eq(&self, other: &Self) -> bool { - self.lsn_range == other.lsn_range - } -} - -impl Eq for ReadDesc {} - /// Data structure which maintains a fringe of layers for the /// read path. The fringe is the set of layers which intersects /// the current keyspace that the search is descending on. @@ -314,6 +289,31 @@ impl Default for LayerFringe { } } +impl Ord for ReadDesc { + fn cmp(&self, other: &Self) -> Ordering { + let ord = self.lsn_range.end.cmp(&other.lsn_range.end); + if ord == std::cmp::Ordering::Equal { + self.lsn_range.start.cmp(&other.lsn_range.start).reverse() + } else { + ord + } + } +} + +impl PartialOrd for ReadDesc { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for ReadDesc { + fn eq(&self, other: &Self) -> bool { + self.lsn_range == other.lsn_range + } +} + +impl Eq for ReadDesc {} + impl ReadableLayer { pub(crate) fn key(&self) -> LayerKey { match self { From 0516d0e6de87c40588ca700bbe960322eb9ad747 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 27 Mar 2024 09:31:39 +0000 Subject: [PATCH 07/11] review: key -> id renames --- pageserver/src/tenant/storage_layer.rs | 18 +++++++++--------- .../src/tenant/storage_layer/inmemory_layer.rs | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 21be25a87741..a781b4dfceb7 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -41,7 +41,7 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; -use self::inmemory_layer::InMemoryLayerKey; +use self::inmemory_layer::InMemoryLayerFileId; use super::timeline::GetVectoredError; use super::PageReconstructError; @@ -206,9 +206,9 @@ impl Default for ValuesReconstructState { /// A key that uniquely identifies a layer in a timeline #[derive(Debug, PartialEq, Eq, Clone, Hash)] -pub(crate) enum LayerKey { - PersitentLayerKey(PersistentLayerKey), - InMemoryLayerKey(InMemoryLayerKey), +pub(crate) enum LayerId { + PersitentLayerId(PersistentLayerKey), + InMemoryLayerId(InMemoryLayerFileId), } /// Layer wrapper for the read path. Note that it is valid @@ -222,7 +222,7 @@ pub(crate) enum ReadableLayer { #[derive(Debug, Clone)] struct ReadDesc { - layer_key: LayerKey, + layer_key: LayerId, lsn_range: Range, } @@ -236,7 +236,7 @@ struct ReadDesc { #[derive(Debug)] pub(crate) struct LayerFringe { planned_reads_by_lsn: BinaryHeap, - layers: HashMap, + layers: HashMap, } impl LayerFringe { @@ -315,10 +315,10 @@ impl PartialEq for ReadDesc { impl Eq for ReadDesc {} impl ReadableLayer { - pub(crate) fn key(&self) -> LayerKey { + pub(crate) fn key(&self) -> LayerId { match self { - Self::PersistentLayer(layer) => LayerKey::PersitentLayerKey(layer.layer_desc().key()), - Self::InMemoryLayer(layer) => LayerKey::InMemoryLayerKey(layer.key()), + Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()), + Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.key()), } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 4ebc6b71d2a1..100b14428934 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -33,13 +33,13 @@ use super::{ }; #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] -pub(crate) struct InMemoryLayerKey(page_cache::FileId); +pub(crate) struct InMemoryLayerFileId(page_cache::FileId); pub struct InMemoryLayer { conf: &'static PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - key: InMemoryLayerKey, + key: InMemoryLayerFileId, /// This layer contains all the changes from 'start_lsn'. The /// start is inclusive. @@ -83,7 +83,7 @@ impl std::fmt::Debug for InMemoryLayerInner { } impl InMemoryLayer { - pub(crate) fn key(&self) -> InMemoryLayerKey { + pub(crate) fn key(&self) -> InMemoryLayerFileId { self.key } @@ -326,7 +326,7 @@ impl InMemoryLayer { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; - let key = InMemoryLayerKey(file.id()); + let key = InMemoryLayerFileId(file.id()); Ok(InMemoryLayer { key, From 34449a3f3e3ba295eb21aedd53c40cede20337e3 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 27 Mar 2024 09:31:49 +0000 Subject: [PATCH 08/11] review: remove dead InMemoryLayerHandle struct --- pageserver/src/tenant/layer_map.rs | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 892f1d9e4bb5..4c4cd90c99b6 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -346,35 +346,6 @@ where } } -#[derive(PartialEq, Eq, Hash, Debug, Clone)] -pub enum InMemoryLayerHandle { - Open { - lsn_floor: Lsn, - end_lsn: Lsn, - }, - Frozen { - idx: usize, - lsn_floor: Lsn, - end_lsn: Lsn, - }, -} - -impl InMemoryLayerHandle { - pub fn get_lsn_floor(&self) -> Lsn { - match self { - InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor, - InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor, - } - } - - pub fn get_end_lsn(&self) -> Lsn { - match self { - InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn, - InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn, - } - } -} - impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given From 2f682a936fe281ba79568ab7f389dce928af0fb1 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 27 Mar 2024 09:42:21 +0000 Subject: [PATCH 09/11] review: replace inline tuple with a struct --- pageserver/src/tenant/storage_layer.rs | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index a781b4dfceb7..7ae31b53875f 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -220,9 +220,12 @@ pub(crate) enum ReadableLayer { InMemoryLayer(Arc), } +/// A partial description of a read to be done. #[derive(Debug, Clone)] struct ReadDesc { + /// A key used to resolve the readable layer within the fringe layer_key: LayerId, + /// Lsn range for the read, used for selecting the next read lsn_range: Range, } @@ -236,7 +239,13 @@ struct ReadDesc { #[derive(Debug)] pub(crate) struct LayerFringe { planned_reads_by_lsn: BinaryHeap, - layers: HashMap, + layers: HashMap, +} + +#[derive(Debug)] +struct LayerKeyspace { + layer: ReadableLayer, + target_keyspace: KeySpace, } impl LayerFringe { @@ -255,7 +264,13 @@ impl LayerFringe { let removed = self.layers.remove_entry(&read_desc.layer_key); match removed { - Some((_, (layer, keyspace))) => Some((layer, keyspace, read_desc.lsn_range)), + Some(( + _, + LayerKeyspace { + layer, + target_keyspace, + }, + )) => Some((layer, target_keyspace, read_desc.lsn_range)), None => unreachable!("fringe internals are always consistent"), } } @@ -270,14 +285,17 @@ impl LayerFringe { let entry = self.layers.entry(key.clone()); match entry { Entry::Occupied(mut entry) => { - entry.get_mut().1.merge(&keyspace); + entry.get_mut().target_keyspace.merge(&keyspace); } Entry::Vacant(entry) => { self.planned_reads_by_lsn.push(ReadDesc { lsn_range, layer_key: key.clone(), }); - entry.insert((layer, keyspace)); + entry.insert(LayerKeyspace { + layer, + target_keyspace: keyspace, + }); } } } From c37f67d6d2b678cbd549acd70357075cc58e355b Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 27 Mar 2024 15:36:46 +0000 Subject: [PATCH 10/11] review: more key -> id --- pageserver/src/tenant/storage_layer.rs | 16 ++++++++-------- .../src/tenant/storage_layer/inmemory_layer.rs | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 7ae31b53875f..a047522f2bb7 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -223,8 +223,8 @@ pub(crate) enum ReadableLayer { /// A partial description of a read to be done. #[derive(Debug, Clone)] struct ReadDesc { - /// A key used to resolve the readable layer within the fringe - layer_key: LayerId, + /// An id used to resolve the readable layer within the fringe + layer_id: LayerId, /// Lsn range for the read, used for selecting the next read lsn_range: Range, } @@ -262,7 +262,7 @@ impl LayerFringe { None => return None, }; - let removed = self.layers.remove_entry(&read_desc.layer_key); + let removed = self.layers.remove_entry(&read_desc.layer_id); match removed { Some(( _, @@ -281,8 +281,8 @@ impl LayerFringe { keyspace: KeySpace, lsn_range: Range, ) { - let key = layer.key(); - let entry = self.layers.entry(key.clone()); + let layer_id = layer.id(); + let entry = self.layers.entry(layer_id.clone()); match entry { Entry::Occupied(mut entry) => { entry.get_mut().target_keyspace.merge(&keyspace); @@ -290,7 +290,7 @@ impl LayerFringe { Entry::Vacant(entry) => { self.planned_reads_by_lsn.push(ReadDesc { lsn_range, - layer_key: key.clone(), + layer_id: layer_id.clone(), }); entry.insert(LayerKeyspace { layer, @@ -333,10 +333,10 @@ impl PartialEq for ReadDesc { impl Eq for ReadDesc {} impl ReadableLayer { - pub(crate) fn key(&self) -> LayerId { + pub(crate) fn id(&self) -> LayerId { match self { Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()), - Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.key()), + Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()), } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 100b14428934..7a70be51cd97 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -39,7 +39,7 @@ pub struct InMemoryLayer { conf: &'static PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - key: InMemoryLayerFileId, + file_id: InMemoryLayerFileId, /// This layer contains all the changes from 'start_lsn'. The /// start is inclusive. @@ -83,8 +83,8 @@ impl std::fmt::Debug for InMemoryLayerInner { } impl InMemoryLayer { - pub(crate) fn key(&self) -> InMemoryLayerFileId { - self.key + pub(crate) fn file_id(&self) -> InMemoryLayerFileId { + self.file_id } pub(crate) fn get_timeline_id(&self) -> TimelineId { @@ -329,7 +329,7 @@ impl InMemoryLayer { let key = InMemoryLayerFileId(file.id()); Ok(InMemoryLayer { - key, + file_id: key, conf, timeline_id, tenant_shard_id, From b1feb1fa006c3d9526b2bea34fe41509c092082a Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 27 Mar 2024 15:43:05 +0000 Subject: [PATCH 11/11] review: musings on correctness --- pageserver/src/tenant/timeline.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2834ab097cb3..8727cc304d87 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2825,6 +2825,13 @@ impl Timeline { // It's safe to drop the layer map lock after planning the next round of reads. // The fringe keeps readable handles for the layers which are safe to read even // if layers were compacted or flushed. + // + // The more interesting consideration is: "Why is the read algorithm still correct + // if the layer map changes while it is operating?". Doing a vectored read on a + // timeline boils down to pushing an imaginary lsn boundary downwards for each range + // covered by the read. The layer map tells us how to move the lsn downwards for a + // range at *a particular point in time*. It is fine for the answer to be different + // at two different time points. drop(guard); if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {