In [4]:
from pyspark.sql import SparkSession

In [2]:
import os

# Location of the local Iceberg warehouse. The directory (and any missing
# parents) is created up front; exist_ok=True keeps this cell safe to re-run.
warehouse_path = "/home/jovyan/iceberg/warehouse"
os.makedirs(name=warehouse_path, exist_ok=True)

In [5]:
# Build (or reuse) a SparkSession that registers a catalog named `local`.
# The catalog is an Iceberg SparkCatalog of type `hadoop`, with its tables
# stored under `warehouse_path`.
spark = (
    SparkSession.builder
    .appName("IcebergLocalSetup")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", warehouse_path)
    .getOrCreate()
)

# Keep driver logging at WARN and above so cell output stays readable.
spark.sparkContext.setLogLevel("WARN")
print("✅ SparkSession initialized with local Iceberg Hadoop catalog")

✅ SparkSession initialized with local Iceberg Hadoop catalog


# Iceberg Supported File Formats

Iceberg supports multiple data file formats (Parquet and ORC are columnar; Avro is row-oriented):
- Parquet
- ORC
- Avro

In [6]:
# Small two-row sample DataFrame (columns: id, name) that the following cells
# write out in each file format.
df = spark.createDataFrame(
    data=[(1, 'Alice'), (2, 'Bob')],
    schema=['id', 'name'],
)

In [7]:
# Write as Parquet: create (or replace) the Iceberg v2 table
# `local.db.parquet_table`. No `write.format.default` property is set here, so
# the table uses Iceberg's default data file format (Parquet, per the Iceberg
# table-properties documentation).
(
    df.writeTo("local.db.parquet_table")
    .using("iceberg")
    .tableProperty("format-version", "2")
    .createOrReplace()
)

In [8]:
# Write as ORC: create (or replace) the Iceberg v2 table `local.db.orc_table`
# and set `write.format.default` so Iceberg stores its data files as ORC.
# Going through the `local` catalog is what makes this an Iceberg table; a
# plain df.write.format("orc").save(path) would bypass Iceberg and leave bare
# ORC files with no table metadata.
(
    df.writeTo("local.db.orc_table")
    .using("iceberg")
    .tableProperty("format-version", "2")
    .tableProperty("write.format.default", "orc")
    .createOrReplace()
)

In [9]:
# Write as Avro: create (or replace) the Iceberg v2 table `local.db.avro_table`
# and set `write.format.default` so Iceberg stores its data files as Avro.
# This also avoids Spark's standalone "avro" data source, which is an external
# module (spark-avro) that the session configured in this notebook does not load.
(
    df.writeTo("local.db.avro_table")
    .using("iceberg")
    .tableProperty("format-version", "2")
    .tableProperty("write.format.default", "avro")
    .createOrReplace()
)
