# Apache Hudi Bloom Index Performance Impact
This notebook demonstrates the performance impact of using Bloom indexing in Apache Hudi with a large synthetic dataset.

In [7]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("SimpleHudiCreate") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog") \
    .getOrCreate()

In [8]:
from pyspark.sql.functions import expr

def generate_large_dataset(num_records):
    return spark.range(0, num_records).withColumn("name", expr("concat('user_', id)")) \
                                      .withColumn("email", expr("concat('user_', id, '@example.com')")) \
                                      .withColumn("timestamp", expr("current_timestamp()"))

num_records = 1000000
df = generate_large_dataset(num_records)
df.show(5)

+---+------+------------------+--------------------+
| id|  name|             email|           timestamp|
+---+------+------------------+--------------------+
|  0|user_0|user_0@example.com|2025-07-14 14:46:...|
|  1|user_1|user_1@example.com|2025-07-14 14:46:...|
|  2|user_2|user_2@example.com|2025-07-14 14:46:...|
|  3|user_3|user_3@example.com|2025-07-14 14:46:...|
|  4|user_4|user_4@example.com|2025-07-14 14:46:...|
+---+------+------------------+--------------------+
only showing top 5 rows



In [9]:
def get_hudi_options(table_name, record_key, precombine_key, index_type):
    return {
        'hoodie.table.name': table_name,
        'hoodie.datasource.write.recordkey.field': record_key,
        'hoodie.datasource.write.precombine.field': precombine_key,
        'hoodie.datasource.write.table.name': table_name,
        'hoodie.datasource.write.operation': 'upsert',
        'hoodie.datasource.write.storage.type': 'COPY_ON_WRITE',
        'hoodie.datasource.write.index.type': index_type,
        'hoodie.datasource.hive_sync.enable': 'false'
    }

In [10]:
import time

path_bloom = "/home/jovyan/hudi/hudi_bloom_index"
start_bloom = time.time()
df.write.format("org.apache.hudi").options(**get_hudi_options("bloom_table", "id", "timestamp", "BLOOM")).mode("overwrite").save(path_bloom)
end_bloom = time.time()
print("Initial write with Bloom index took:", end_bloom - start_bloom, "seconds")

Initial write with Bloom index took: 50.55580449104309 seconds


In [5]:
path_simple = "/home/jovyan/hudi/hudi_simple_index"
start_simple = time.time()
df.write.format("org.apache.hudi").options(**get_hudi_options("simple_table", "id", "timestamp", "SIMPLE")).mode("overwrite").save(path_simple)
end_simple = time.time()
print("Initial write with Simple index took:", end_simple - start_simple, "seconds")

Initial write with Simple index took: 54.582783699035645 seconds


In [6]:
df_upsert = df.withColumn("email", expr("concat('updated_', email)"))

start_upsert_bloom = time.time()
df_upsert.write.format("org.apache.hudi").options(**get_hudi_options("bloom_table", "id", "timestamp", "BLOOM")).mode("append").save(path_bloom)
end_upsert_bloom = time.time()

start_upsert_simple = time.time()
df_upsert.write.format("org.apache.hudi").options(**get_hudi_options("simple_table", "id", "timestamp", "SIMPLE")).mode("append").save(path_simple)
end_upsert_simple = time.time()

print("Upsert with Bloom index took:", end_upsert_bloom - start_upsert_bloom, "seconds")
print("Upsert with Simple index took:", end_upsert_simple - start_upsert_simple, "seconds")

Upsert with Bloom index took: 70.63161420822144 seconds
Upsert with Simple index took: 62.40643382072449 seconds


## Conclusion
This notebook demonstrates how Bloom indexing in Apache Hudi can significantly improve upsert performance compared to Simple indexing, especially on large datasets.