Ingesting data into Iceberg using PySpark

In [1]:
import subprocess


# Function to install Maven (Linux via apt, macOS via Homebrew; Windows users must install manually)
def install_maven():
    """Make sure the ``mvn`` command is available, installing it if it is missing.

    The function first probes ``mvn -version``. If the executable cannot be
    found, it installs Maven with the first package manager found on PATH:
    ``apt`` (``apt update`` followed by ``apt install -y maven``) or
    ``brew`` (``brew install maven``). The commands are invoked directly,
    without ``sudo``.

    Raises:
        FileNotFoundError: Maven is missing and neither ``apt`` nor ``brew``
            is available to install it.
        subprocess.CalledProcessError: ``mvn -version`` or one of the install
            commands exited with a non-zero status.
    """
    import shutil  # local import so this cell does not depend on another import cell

    try:
        # Check if Maven is already installed; output is captured to keep the cell quiet
        subprocess.run(["mvn", "-version"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        print("Maven is already installed.")
        return
    except FileNotFoundError:
        # The mvn executable is not on PATH: fall through to the install path
        print("Maven not found. Installing Maven...")

    if shutil.which("apt"):
        subprocess.run(["apt", "update"], check=True)
        subprocess.run(["apt", "install", "-y", "maven"], check=True)  # For Linux (apt)
    elif shutil.which("brew"):
        subprocess.run(["brew", "install", "maven"], check=True)  # For macOS (Homebrew)
    else:
        # Fail with an actionable message instead of a bare "No such file: 'apt'"
        raise FileNotFoundError(
            "Maven is not installed and no supported package manager (apt, brew) "
            "was found on PATH; please install Maven manually."
        )
    print("Maven installed successfully.")


# Function to download a specific JAR using Maven
def download_jar(group_id, artifact_id, version, output_dir=None):
    """Download one JAR with ``mvn dependency:copy``.

    Args:
        group_id: Maven group id, e.g. ``"org.slf4j"``.
        artifact_id: Maven artifact id, e.g. ``"slf4j-api"``.
        version: Artifact version, e.g. ``"1.7.30"``.
        output_dir: Optional destination directory, passed to Maven as
            ``-DoutputDirectory``. When omitted, the plugin's own default
            location is used, exactly as before this parameter existed.

    Returns:
        bool: ``True`` when Maven exited successfully, ``False`` when the
        download failed or the ``mvn`` executable could not be started.
        The failure reason is printed rather than raised.
    """
    coordinates = f"{group_id}:{artifact_id}:{version}"
    command = ["mvn", "dependency:copy", f"-Dartifact={coordinates}"]
    if output_dir is not None:
        command.append(f"-DoutputDirectory={output_dir}")

    print(f"Downloading JAR: {coordinates}")
    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # Best-effort by design: report the failure and let the caller decide what to do
        print(f"Failed to download JAR: {e}")
        return False
    print("JAR downloaded successfully")
    return True


# Make sure the Maven CLI is available before it is needed
install_maven()
# Fetch the required JAR(s) by their Maven coordinates
download_jar(group_id="org.slf4j", artifact_id="slf4j-api", version="1.7.30")


Maven is already installed.
Downloading JAR: org.slf4j:slf4j-api:1.7.30
[INFO] Scanning for projects...
[INFO] 
[INFO] ------------------< org.apache.maven:standalone-pom >-------------------
[INFO] Building Maven Stub Project (No POM) 1
[INFO] --------------------------------[ pom ]---------------------------------
[INFO] 
[INFO] --- dependency:3.7.0:copy (default-cli) @ standalone-pom ---
[INFO] Configured Artifact: org.slf4j:slf4j-api:1.7.30:jar
[INFO] org.slf4j:slf4j-api:1.7.30:jar already exists in /Users/truongngocson/Documents/Projects/apache-iceberg/notebooks/${project.basedir}/target/dependency
[INFO] ------------------------------------------------------------------------
[INFO] BUILD SUCCESS
[INFO] ------------------------------------------------------------------------
[INFO] Total time:  0.471 s
[INFO] Finished at: 2025-08-27T11:10:01+07:00
[INFO] ------------------------------------------------------------------------
JAR downloaded successfully


Start the ingestion

In [2]:
import pyspark
import os
from pyspark.sql import SparkSession

## DEFINE VARIABLES
# Every endpoint and credential can be overridden through the environment;
# the fallbacks are the local-development values this notebook used before.
catalog_uri = os.getenv('CATALOG_URI', "http://localhost:19120/api/v1")
warehouse = os.getenv('WAREHOUSE', "s3://warehouse/")
storage_uri = os.getenv('STORAGE_URI', "http://127.0.0.1:9000")
# Define the JDBC connection properties
jdbc_url = os.getenv('JDBC_URL', "jdbc:postgresql://localhost:5435/mydb")
properties = {
    # Credentials come from the environment so real secrets never live in the notebook
    "user": os.getenv('JDBC_USER', "myuser"),
    "password": os.getenv('JDBC_PASSWORD', "mypassword"),
    "driver": "org.postgresql.Driver",
    # Spark JDBC option: number of rows fetched per round trip, intended to
    # stream the table instead of pulling the whole result set at once
    "fetchsize": os.getenv('JDBC_FETCH_SIZE', "10000"),
}
# Comma-separated list of local JAR file names
# NOTE(review): not referenced by the Spark configuration visible in this notebook — confirm whether it is still needed
local_jars = ','.join([
    'slf4j-api-1.7.30.jar'
])

## CONFIGURE SPARK SESSION
# Maven coordinates Spark resolves at start-up (JDBC driver, Iceberg runtime,
# Nessie extensions, AWS SDK), joined into the comma-separated form Spark expects.
spark_packages = ','.join([
    'org.postgresql:postgresql:42.7.3',
    'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0',
    'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1',
    'software.amazon.awssdk:bundle:2.24.8',
    'software.amazon.awssdk:url-connection-client:2.24.8',
])

# SQL extensions for Iceberg and Nessie
spark_extensions = ','.join([
    'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions',
    'org.projectnessie.spark.extensions.NessieSparkSessionExtensions',
])

# All Spark settings in one table: dependencies, the "nessie" catalog definition,
# and resource tuning.
spark_settings = {
    'spark.jars.packages': spark_packages,
    'spark.sql.extensions': spark_extensions,
    # "nessie" catalog: Iceberg SparkCatalog backed by a Nessie server, data files on S3-compatible storage
    'spark.sql.catalog.nessie': 'org.apache.iceberg.spark.SparkCatalog',
    'spark.sql.catalog.nessie.uri': catalog_uri,
    'spark.sql.catalog.nessie.ref': 'main',
    'spark.sql.catalog.nessie.authentication.type': 'NONE',
    'spark.sql.catalog.nessie.catalog-impl': 'org.apache.iceberg.nessie.NessieCatalog',
    'spark.sql.catalog.nessie.s3.endpoint': storage_uri,
    'spark.sql.catalog.nessie.warehouse': warehouse,
    'spark.sql.catalog.nessie.io-impl': 'org.apache.iceberg.aws.s3.S3FileIO',
    # Resource tuning
    "spark.driver.memory": "2g",
    "spark.executor.memory": "2g",
    "spark.sql.shuffle.partitions": "64",
    # NOTE(review): 'write.target-file-size-bytes' is normally an Iceberg table property;
    # confirm it takes effect when supplied as a catalog option like this.
    "spark.sql.catalog.nessie.write.target-file-size-bytes": str(128 * 1024 * 1024),
}

conf = (
    pyspark.SparkConf()
    .setAppName('Iceberg Ingestion')
    .setAll(list(spark_settings.items()))
)

## START SPARK SESSION
# Reuses an already-running session if one exists, otherwise creates one from `conf`
spark = SparkSession.builder.config(conf=conf).getOrCreate()

25/08/27 11:10:01 WARN Utils: Your hostname, MacBook-Air-cua-Ngoc-2.local resolves to a loopback address: 127.0.0.1; using 192.168.1.5 instead (on interface en0)
25/08/27 11:10:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/truongngocson/.ivy2/cache
The jars for the packages stored in: /Users/truongngocson/.ivy2/jars
org.postgresql#postgresql added as a dependency
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12 added as a dependency
software.amazon.awssdk#bundle added as a dependency
software.amazon.awssdk#url-connection-client added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-077232b4-d396-4fee-b3e1-f5d833b0e739;1.0
	confs: [default]
	found org.postgresql#postgresql;42.7.3 in central
	found org.checkerframework#checker-qual;3.42.0 in local-m2-cache
	found org.apache.iceberg#iceberg-spark-runtime-

:: loading settings :: url = jar:file:/Users/truongngocson/Documents/Projects/apache-iceberg/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found software.amazon.awssdk#url-connection-client;2.24.8 in central
	found software.amazon.awssdk#utils;2.24.8 in central
	found org.reactivestreams#reactive-streams;1.0.4 in local-m2-cache
	found software.amazon.awssdk#annotations;2.24.8 in central
	found org.slf4j#slf4j-api;1.7.30 in local-m2-cache
	found software.amazon.awssdk#http-client-spi;2.24.8 in central
	found software.amazon.awssdk#metrics-spi;2.24.8 in central
:: resolution report :: resolve 155ms :: artifacts dl 8ms
	:: modules in use:
	org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.0 from central in [default]
	org.checkerframework#checker-qual;3.42.0 from local-m2-cache in [default]
	org.postgresql#postgresql;42.7.3 from central in [default]
	org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.77.1 from central in [default]
	org.reactivestreams#reactive-streams;1.0.4 from local-m2-cache in [default]
	org.slf4j#slf4j-api;1.7.30 from local-m2-cache in [default]
	software.amazon.awssdk#annotatio

In [3]:
print("Spark Running")

try:
    spark.catalog.clearCache()

    # Read the hr_data table from Postgres into a Spark DataFrame
    hr_df = spark.read.jdbc(url=jdbc_url, table="hr_data", properties=properties)

    # Show the first few rows of the dataset
    hr_df.show()

    # Control number of parallel output writers explicitly.
    # A new name is used so the frame displayed above is not silently replaced.
    hr_df_repartitioned = hr_df.repartition(8)

    # Create a namespace
    spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.hr;")

    # Write the DataFrame to an Iceberg table in the Nessie catalog
    # (createOrReplace replaces the table if it already exists)
    hr_df_repartitioned.writeTo("nessie.hr.hr_data").createOrReplace()

    # Verify that the data was written to Iceberg by reading the table
    spark.read.table("nessie.hr.hr_data").show()

    print("Ingested data successfully into Iceberg")
finally:
    # Always release the Spark session, even when a step above fails
    spark.stop()

Spark Running


                                                                                

+----------+-----------+--------------------+----------+--------------------+----------+--------------------+------------------+----------------+--------+---------+----------+
|Unnamed: 0|Employee_ID|           Full_Name|Department|           Job_Title| Hire_Date|            Location|Performance_Rating|Experience_Years|  Status|Work_Mode|Salary_INR|
+----------+-----------+--------------------+----------+--------------------+----------+--------------------+------------------+----------------+--------+---------+----------+
|   1021851| EMP1021852|Christopher Martinez|Operations|Logistics Coordin...|2024-01-16|Lake Kathleen, Fr...|                 3|               1|  Active|   Remote|    793553|
|   1021852| EMP1021853|        John Jenkins|        HR|Talent Acquisitio...|2025-04-19|East Mariastad, P...|                 1|               0|  Active|   Remote|   1162338|
|   1021853| EMP1021854|      Donna Williams|Operations|Operations Executive|2020-08-09|South Audrey, Sur...|           

25/08/27 11:12:25 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
25/08/27 11:12:26 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
25/08/27 11:12:27 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
25/08/27 11:12:28 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
25/08/27 11:12:29 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
25/08/27 11:12:30 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
25/08/27 11:12:32 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
25/08/27 11:12:33 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
25/08/27 11:12:34 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
25/08/27 11:12:35 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
25/08/27 11:12:36 WARN TaskMem

+----------+-----------+--------------------+----------+--------------------+----------+--------------------+------------------+----------------+----------+---------+----------+
|Unnamed: 0|Employee_ID|           Full_Name|Department|           Job_Title| Hire_Date|            Location|Performance_Rating|Experience_Years|    Status|Work_Mode|Salary_INR|
+----------+-----------+--------------------+----------+--------------------+----------+--------------------+------------------+----------------+----------+---------+----------+
|   1750498| EMP1750499|        Cindy Hudson|        IT|   Software Engineer|2011-11-13|South Ronaldton, ...|                 5|              13|  Resigned|  On-site|    831281|
|   1571875| EMP1571876|        Dennis Black|     Sales|     Account Manager|2011-02-21|Hollandbury, Micr...|                 5|              14|    Active|  On-site|    415567|
|   1300693| EMP1300694|      Alexander Hill|     Sales|     Sales Executive|2019-05-12|Lake Cherylmouth,...| 