Skip to content

Commit

Permalink
perf: improve parquet merge speed when there are too many fields (#3231)
Browse files Browse the repository at this point in the history
  • Loading branch information
haohuaijin committed Apr 15, 2024
1 parent 38abbcb commit d654dd2
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 5 deletions.
4 changes: 3 additions & 1 deletion src/job/files/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,7 @@ async fn merge_files(
let full_text_search_fields = stream::get_stream_setting_fts_fields(latest_schema).unwrap();
let mut buf = Vec::new();
let mut fts_buf = Vec::new();
let start = std::time::Instant::now();
let (mut new_file_meta, _) = match merge_parquet_files(
tmp_dir.name(),
stream_type,
Expand Down Expand Up @@ -520,11 +521,12 @@ async fn merge_files(
let new_file_key =
super::generate_storage_file_name(&org_id, stream_type, &stream_name, &file_name);
log::info!(
"[INGESTER:JOB:{thread_id}] merge file succeeded, {} files into a new file: {}, original_size: {}, compressed_size: {}",
"[INGESTER:JOB:{thread_id}] merge file succeeded, {} files into a new file: {}, original_size: {}, compressed_size: {}, took: {:?}",
retain_file_list.len(),
new_file_key,
new_file_meta.original_size,
new_file_meta.compressed_size,
start.elapsed().as_millis(),
);

let buf = Bytes::from(buf);
Expand Down
4 changes: 3 additions & 1 deletion src/service/compact/merge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,7 @@ async fn merge_files(

let mut buf = Vec::new();
let mut fts_buf = Vec::new();
let start = std::time::Instant::now();
let (mut new_file_meta, _) = datafusion::exec::merge_parquet_files(
tmp_dir.name(),
stream_type,
Expand Down Expand Up @@ -594,11 +595,12 @@ async fn merge_files(
let id = ider::generate();
let new_file_key = format!("{prefix}/{id}{}", FILE_EXT_PARQUET);
log::info!(
"[COMPACT] merge file succeeded, {} files into a new file: {}, original_size: {}, compressed_size: {}",
"[COMPACT] merge file succeeded, {} files into a new file: {}, original_size: {}, compressed_size: {}, took: {:?}",
retain_file_list.len(),
new_file_key,
new_file_meta.original_size,
new_file_meta.compressed_size,
start.elapsed().as_millis(),
);

let buf = Bytes::from(buf);
Expand Down
14 changes: 11 additions & 3 deletions src/service/search/datafusion/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1068,10 +1068,10 @@ pub async fn merge_parquet_files(
let prefix = ListingTableUrl::parse(format!("tmpfs:///{trace_id}/"))?;
let config = ListingTableConfig::new(prefix)
.with_listing_options(listing_options)
.with_schema(schema);
.with_schema(schema.clone());

let table = ListingTable::try_new(config)?;
ctx.register_table("tbl", Arc::new(table))?;
let table = Arc::new(ListingTable::try_new(config)?);
ctx.register_table("tbl", table.clone())?;

// get meta data
let meta_sql = format!(
Expand Down Expand Up @@ -1133,6 +1133,14 @@ pub async fn merge_parquet_files(
)
};

let select_wildcard = query_sql.to_lowercase().starts_with("select * ");
let without_optimizer = select_wildcard
&& CONFIG.limit.query_optimization_num_fields > 0
&& schema.fields().len() > CONFIG.limit.query_optimization_num_fields
&& stream_type != StreamType::Index;
let ctx = prepare_datafusion_context(None, &SearchType::Normal, without_optimizer)?;
ctx.register_table("tbl", table.clone())?;

let df = ctx.sql(&query_sql).await?;
let schema: Schema = df.schema().into();
let schema = Arc::new(schema);
Expand Down

0 comments on commit d654dd2

Please sign in to comment.