-
-
Notifications
You must be signed in to change notification settings - Fork 157
optimize performance of ingest path #1466
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -135,50 +135,107 @@ impl Stream { | |
| } | ||
|
|
||
| // Concatenates record batches and puts them in memory store for each event. | ||
| pub fn push( | ||
| &self, | ||
| // This method now defers memtable and disk operations to blocking thread pools. | ||
| // Disk write is awaited to ensure durability (data is persisted), while memtable push is fire-and-forget. | ||
| // If disk write fails, request fails - ensuring data consistency. | ||
| // If memtable push fails, data is still on disk | ||
| pub async fn push( | ||
| self: &Arc<Self>, | ||
| schema_key: &str, | ||
| record: &RecordBatch, | ||
| parsed_timestamp: NaiveDateTime, | ||
| custom_partition_values: &HashMap<String, String>, | ||
| stream_type: StreamType, | ||
| ) -> Result<(), StagingError> { | ||
| let mut guard = match self.writer.lock() { | ||
| Ok(guard) => guard, | ||
| Err(poisoned) => { | ||
| error!( | ||
| "Writer lock poisoned while ingesting data for stream {}", | ||
| self.stream_name | ||
| ); | ||
| poisoned.into_inner() | ||
| } | ||
| }; | ||
| if self.options.mode != Mode::Query || stream_type == StreamType::Internal { | ||
| let record_clone = record.clone(); | ||
| let schema_key_clone = schema_key.to_string(); | ||
| let options_mode = self.options.mode; | ||
|
|
||
| // Defer disk write to blocking thread pool and await it | ||
| // This ensures disk write succeeds before we return, maintaining durability | ||
| // If disk write fails, request fails - data is not persisted, so we shouldn't return success | ||
| if options_mode != Mode::Query || stream_type == StreamType::Internal { | ||
| let filename = | ||
| self.filename_by_partition(schema_key, parsed_timestamp, custom_partition_values); | ||
| match guard.disk.get_mut(&filename) { | ||
| Some(writer) => { | ||
| writer.write(record)?; | ||
| } | ||
| None => { | ||
| // entry is not present thus we create it | ||
| std::fs::create_dir_all(&self.data_path)?; | ||
|
|
||
| let stream_for_disk = Arc::clone(self); | ||
| let filename_clone = filename.clone(); | ||
| let filename_for_error = filename.clone(); | ||
| let record_for_disk = record_clone.clone(); | ||
| let parsed_timestamp_for_disk = parsed_timestamp; | ||
| let stream_name_clone = self.stream_name.clone(); | ||
|
|
||
| // Await disk write | ||
| tokio::task::spawn_blocking(move || { | ||
| let mut guard = match stream_for_disk.writer.lock() { | ||
| Ok(guard) => guard, | ||
| Err(poisoned) => { | ||
| error!( | ||
| "Writer lock poisoned while writing to disk for stream {}", | ||
| stream_for_disk.stream_name | ||
| ); | ||
| poisoned.into_inner() | ||
| } | ||
| }; | ||
|
|
||
| match guard.disk.get_mut(&filename_clone) { | ||
| Some(writer) => { | ||
| // Blocking disk write - runs in background thread pool | ||
| writer.write(&record_for_disk) | ||
| } | ||
| None => { | ||
| std::fs::create_dir_all(&stream_for_disk.data_path)?; | ||
|
|
||
| let range = TimeRange::granularity_range( | ||
| parsed_timestamp.and_local_timezone(Utc).unwrap(), | ||
| OBJECT_STORE_DATA_GRANULARITY, | ||
| ); | ||
| let file_path = self.data_path.join(&filename); | ||
| let mut writer = DiskWriter::try_new(file_path, &record.schema(), range) | ||
| .expect("File and RecordBatch both are checked"); | ||
| let range = TimeRange::granularity_range( | ||
| parsed_timestamp_for_disk.and_local_timezone(Utc).unwrap(), | ||
| OBJECT_STORE_DATA_GRANULARITY, | ||
| ); | ||
| let file_path = stream_for_disk.data_path.join(&filename_clone); | ||
| let mut writer = DiskWriter::try_new(file_path, &record_for_disk.schema(), range)?; | ||
|
|
||
| writer.write(record)?; | ||
| guard.disk.insert(filename, writer); | ||
| writer.write(&record_for_disk)?; | ||
| guard.disk.insert(filename_clone, writer); | ||
| Ok(()) | ||
| } | ||
| } | ||
| }; | ||
| }) | ||
| .await | ||
| .map_err(|e| StagingError::ObjectStorage(std::io::Error::new( | ||
| std::io::ErrorKind::Other, | ||
| format!("Disk write task failed: {}", e) | ||
| )))? | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| .map_err(|e| { | ||
| error!( | ||
| "Disk write failed for stream {} file {}: {}", | ||
| stream_name_clone, filename_for_error, e | ||
| ); | ||
| e | ||
| })?; | ||
| } | ||
|
|
||
| guard.mem.push(schema_key, record); | ||
| // Defer memtable push to blocking thread pool - fire-and-forget | ||
| // If memtable push fails, data is still safely on disk and memtable can be rebuilt | ||
| { | ||
| let stream_for_memtable = Arc::clone(self); | ||
| let schema_key_for_memtable = schema_key_clone; | ||
| let record_for_memtable = record.clone(); | ||
|
|
||
| // TODO: The concat operation at 16384 events is CPU-bound but won't block the request path | ||
| tokio::task::spawn_blocking(move || { | ||
| let mut guard = match stream_for_memtable.writer.lock() { | ||
| Ok(guard) => guard, | ||
| Err(poisoned) => { | ||
| error!( | ||
| "Writer lock poisoned while pushing to memtable for stream {}", | ||
| stream_for_memtable.stream_name | ||
| ); | ||
| poisoned.into_inner() | ||
| } | ||
|
Comment on lines
+225
to
+233
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same lock poisoning issue: should fail instead of continuing. Identical to the disk write path, using `into_inner()`. Apply a similar fix as suggested for the disk write path — return early, or log and skip the operation: tokio::task::spawn_blocking(move || {
- let mut guard = match stream_for_memtable.writer.lock() {
- Ok(guard) => guard,
- Err(poisoned) => {
- error!(
- "Writer lock poisoned while pushing to memtable for stream {}",
- stream_for_memtable.stream_name
- );
- poisoned.into_inner()
- }
- };
+ let mut guard = match stream_for_memtable.writer.lock() {
+ Ok(guard) => guard,
+ Err(e) => {
+ error!(
+ "Writer lock poisoned while pushing to memtable for stream {}: {}. Skipping memtable update.",
+ stream_for_memtable.stream_name, e
+ );
+ return;
+ }
+ };🤖 Prompt for AI Agents |
||
| }; | ||
| // Push to memtable - concat happens here at 16384 events | ||
| guard.mem.push(&schema_key_for_memtable, &record_for_memtable); | ||
| }); | ||
| } | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
@@ -1328,15 +1385,43 @@ mod tests { | |
| ], | ||
| ) | ||
| .unwrap(); | ||
| staging | ||
| .push( | ||
| "abc", | ||
| &batch, | ||
| time, | ||
| &HashMap::new(), | ||
| StreamType::UserDefined, | ||
| ) | ||
| .unwrap(); | ||
|
|
||
| // For sync tests - create a new runtime | ||
| let rt = tokio::runtime::Runtime::new().unwrap(); | ||
| rt.block_on(staging.push( | ||
| "abc", | ||
| &batch, | ||
| time, | ||
| &HashMap::new(), | ||
| StreamType::UserDefined, | ||
| )).unwrap(); | ||
| staging.flush(true); | ||
| } | ||
|
|
||
| // Async version for async tests (like #[tokio::test]) | ||
| async fn write_log_async(staging: &StreamRef, schema: &Schema, mins: i64) { | ||
| let time: NaiveDateTime = Utc::now() | ||
| .checked_sub_signed(TimeDelta::minutes(mins)) | ||
| .unwrap() | ||
| .naive_utc(); | ||
| let batch = RecordBatch::try_new( | ||
| Arc::new(schema.clone()), | ||
| vec![ | ||
| Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3])), | ||
| Arc::new(Int32Array::from(vec![1, 2, 3])), | ||
| Arc::new(StringArray::from(vec!["a", "b", "c"])), | ||
| ], | ||
| ) | ||
| .unwrap(); | ||
|
|
||
| // Direct await - we're already in an async context | ||
| staging.push( | ||
| "abc", | ||
| &batch, | ||
| time, | ||
| &HashMap::new(), | ||
| StreamType::UserDefined, | ||
| ).await.unwrap(); | ||
| staging.flush(true); | ||
| } | ||
|
|
||
|
|
@@ -1468,11 +1553,11 @@ mod tests { | |
|
|
||
| // 2 logs in the previous minutes | ||
| for i in 0..2 { | ||
| write_log(&staging, &schema, i); | ||
| write_log_async(&staging, &schema, i).await; | ||
| } | ||
| sleep(Duration::from_secs(60)).await; | ||
|
|
||
| write_log(&staging, &schema, 0); | ||
| write_log_async(&staging, &schema, 0).await; | ||
|
|
||
| // verify the arrow files exist in staging | ||
| assert_eq!(staging.arrow_files().len(), 3); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lock poisoning should fail the operation, not continue with potentially corrupt state.
Using
into_inner() after a poisoned lock allows the operation to proceed with potentially corrupt state. For data durability guarantees, this should fail fast. Apply this diff to fail on poisoned locks:
tokio::task::spawn_blocking(move || { - let mut guard = match stream_for_disk.writer.lock() { - Ok(guard) => guard, - Err(poisoned) => { - error!( - "Writer lock poisoned while writing to disk for stream {}", - stream_for_disk.stream_name - ); - poisoned.into_inner() - } - }; + let mut guard = stream_for_disk.writer.lock() + .map_err(|e| StagingError::ObjectStorage(std::io::Error::new( + std::io::ErrorKind::Other, + format!("Writer lock poisoned for stream {}: {}", stream_for_disk.stream_name, e) + )))?;🤖 Prompt for AI Agents