In [1]:
import os
import shutil
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Build (or reuse) the Spark session for this notebook.
# The three settings select the Kryo serializer and plug in the Hudi
# session extension and Hudi catalog classes named in the values below.
spark = (
    SparkSession.builder
    .appName("SimpleHudiCreate")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
    .getOrCreate()
)


In [3]:
# Function to calculate directory size
def get_directory_size(directory):
    """Return the combined size, in MB, of every file under ``directory``.

    The tree is walked recursively with ``os.walk`` and the byte size of each
    listed file is summed, then converted to mebibytes (bytes / 1024**2).

    Args:
        directory: Path of the directory tree to measure.

    Returns:
        float: Total size in MB. A directory that does not exist or contains
        no files yields ``0.0`` because ``os.walk`` produces nothing for it.
    """
    total_bytes = 0
    for dirpath, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                total_bytes += os.path.getsize(file_path)
            except OSError:
                # Dangling symlink, or the file was removed between being
                # listed and being measured: skip it rather than aborting
                # the whole measurement.
                continue
    return total_bytes / (1024 * 1024)  # Convert to MB

In [4]:
# Initial Data
# Three seed rows of (id, name, amount) used for the first write to both tables.
initial_data = spark.createDataFrame(
    data=[
        (1, "Alice", 5000),
        (2, "Bob", 6000),
        (3, "Charlie", 5500),
    ],
    schema=["id", "name", "amount"],
)

In [8]:
# Print the seed rows as a text table to confirm the DataFrame contents.
initial_data.show()

+---+-------+------+
| id|   name|amount|
+---+-------+------+
|  1|  Alice|  5000|
|  2|    Bob|  6000|
|  3|Charlie|  5500|
+---+-------+------+



In [9]:
# CoW Table Configuration
# Rows are keyed on `id`; `amount` is the configured precombine field.
cow_options = dict(
    [
        ('hoodie.table.name', 'cow_storage_table'),
        ('hoodie.datasource.write.table.type', 'COPY_ON_WRITE'),
        ('hoodie.datasource.write.recordkey.field', 'id'),
        ('hoodie.datasource.write.precombine.field', 'amount'),
    ]
)

# MoR Table Configuration
# Same key/precombine settings as the CoW table; only the table name and
# table type differ, so start from a copy and override those two entries.
mor_options = {
    **cow_options,
    'hoodie.table.name': 'mor_storage_table',
    'hoodie.datasource.write.table.type': 'MERGE_ON_READ',
}

In [14]:
# Root directory under which both Hudi tables are written.
# Defaults to the original notebook location, but can be redirected by setting
# the HUDI_BASE_PATH environment variable before starting the kernel, so the
# notebook is not tied to one machine's home directory.
base_path = os.environ.get("HUDI_BASE_PATH", "/home/jovyan/hudi")

# Paths for storage analysis
cow_path = f"{base_path}/cow_storage"
mor_path = f"{base_path}/mor_storage"

In [15]:
# Clean previous runs
# Best-effort removal of both table directories; errors (for example a
# directory that does not exist yet) are ignored on purpose.
for stale_table_dir in (cow_path, mor_path):
    shutil.rmtree(stale_table_dir, ignore_errors=True)

In [16]:
# Initial Write
# Write the same seed rows to the CoW table first, then the MoR table,
# using "overwrite" save mode for both.
for table_options, table_path in ((cow_options, cow_path), (mor_options, mor_path)):
    (
        initial_data.write
        .format("hudi")
        .options(**table_options)
        .mode("overwrite")
        .save(table_path)
    )

In [17]:
# Upsert Data
# One row reuses an id from the initial data, the other introduces a new id.
upsert_data = spark.createDataFrame(
    data=[
        (2, "Bob Updated", 7000),  # Update existing
        (4, "David", 5200),        # New record
    ],
    schema=["id", "name", "amount"],
)

In [18]:
# Perform Upserts
# Apply the same change set to the CoW table, then the MoR table, this time
# with "append" save mode so the existing tables are written into rather
# than replaced.
for table_options, table_path in ((cow_options, cow_path), (mor_options, mor_path)):
    (
        upsert_data.write
        .format("hudi")
        .options(**table_options)
        .mode("append")
        .save(table_path)
    )

In [20]:
# Storage Analysis
# Size in MB of everything found under each table directory.
# NOTE(review): the walk covers every file below the path, so any
# non-data files Hudi keeps there are presumably counted too; confirm this is
# the intended comparison.
cow_storage, mor_storage = (
    get_directory_size(table_path) for table_path in (cow_path, mor_path)
)
# File Count Analysis
# Number of files Spark reports as inputs when each table is read back.
cow_files, mor_files = (
    len(spark.read.format("hudi").load(table_path).inputFiles())
    for table_path in (cow_path, mor_path)
)

In [21]:
# Reporting
# Print the summary line by line: storage totals first, then file counts.
divider = "-" * 40
for report_line in (
    "📊 Storage Impact Analysis",
    divider,
    f"COPY_ON_WRITE Storage: {cow_storage:.2f} MB",
    f"MERGE_ON_READ Storage: {mor_storage:.2f} MB",
    "\n🗂️ File Count Analysis",
    divider,
    f"COPY_ON_WRITE Files: {cow_files}",
    f"MERGE_ON_READ Files: {mor_files}",
):
    print(report_line)

📊 Storage Impact Analysis
----------------------------------------
COPY_ON_WRITE Storage: 0.99 MB
MERGE_ON_READ Storage: 0.99 MB

🗂️ File Count Analysis
----------------------------------------
COPY_ON_WRITE Files: 1
MERGE_ON_READ Files: 1


In [22]:
# Detailed File Listing
# For each table, print a heading followed by every input file Spark reports
# when the table is read back (CoW first, then MoR).
for heading, table_path in (
    ("\n📁 CoW Files:", cow_path),
    ("\n📁 MoR Files:", mor_path),
):
    print(heading)
    for data_file in spark.read.format("hudi").load(table_path).inputFiles():
        print(data_file)


📁 CoW Files:
file:/home/jovyan/hudi/cow_storage/be7cfb4c-a047-48c3-8c58-c318290c9a6b-0_0-94-90_20250714081608124.parquet

📁 MoR Files:
file:/home/jovyan/hudi/mor_storage/2cd8f308-5e80-4459-a8a9-bee15a18d448-0_0-121-117_20250714081612603.parquet
