From 22b967dc5ca11c26a1bf777c4e36d3d044604b11 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Tue, 11 Nov 2025 18:20:36 -0500 Subject: [PATCH 1/2] fix: bytes scanned in query instead of using file_size from manifest -- which is size of json we should use ingestion_size -- which is compressed size --- src/query/stream_schema_provider.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index b3907a9e9..1d8163fae 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -339,13 +339,13 @@ impl StandardTableProvider { mut file_path, num_rows, columns, - file_size, + ingestion_size, .. } = file; // Track billing metrics for files scanned in query file_count += 1; - total_file_size += file_size; + total_file_size += ingestion_size; // object_store::path::Path doesn't automatically deal with Windows path separators // to do that, we are using from_absolute_path() which takes into consideration the underlying filesystem From 1c7f99680689043ba61b15aac0f72c6d94b54d02 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Mon, 17 Nov 2025 14:10:55 -0800 Subject: [PATCH 2/2] update the compressed size for query bytes scanned --- src/query/stream_schema_provider.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index 1d8163fae..6c151fb44 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -327,7 +327,7 @@ impl StandardTableProvider { let mut partitioned_files = Vec::from_iter((0..target_partition).map(|_| Vec::new())); let mut column_statistics = HashMap::<String, Option<TypedStatistics>>::new(); let mut count = 0; - let mut total_file_size = 0u64; + let mut total_compressed_size = 0u64; let mut file_count = 0u64; for (index, file) in manifest_files .into_iter() @@ -339,13 +339,14 @@ impl StandardTableProvider { mut file_path, num_rows,
columns, - ingestion_size, .. } = file; // Track billing metrics for files scanned in query file_count += 1; - total_file_size += ingestion_size; + // Calculate actual compressed bytes that will be read from storage + let compressed_bytes: u64 = columns.iter().map(|col| col.compressed_size).sum(); + total_compressed_size += compressed_bytes; // object_store::path::Path doesn't automatically deal with Windows path separators // to do that, we are using from_absolute_path() which takes into consideration the underlying filesystem @@ -406,7 +407,8 @@ impl StandardTableProvider { // Track billing metrics for query scan let current_date = chrono::Utc::now().date_naive().to_string(); increment_files_scanned_in_query_by_date(file_count, &current_date); - increment_bytes_scanned_in_query_by_date(total_file_size, &current_date); + // Use compressed size as it represents actual bytes read from storage (S3/object store charges) + increment_bytes_scanned_in_query_by_date(total_compressed_size, &current_date); (partitioned_files, statistics) }