In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create a Spark session that can read/write Delta Lake tables on MinIO
# (S3-compatible object storage) through the Hadoop S3A connector.
#
# These are launch-time settings: if a session already exists in this kernel,
# getOrCreate() hands that one back, so restart the kernel after editing this cell.
spark = (
    SparkSession.builder
    .appName("DeltaLakeWithMinIO")
    # NOTE(review): credentials are hard-coded; fine for a throwaway local MinIO,
    # but they should come from the environment or a secret store anywhere else.
    .config("spark.hadoop.fs.s3a.access.key", "minio")
    .config("spark.hadoop.fs.s3a.secret.key", "minio123")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    # Address buckets as http://host/bucket/key rather than http://bucket.host/key.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # `spark.jars.packages` is ONE comma-separated list of Maven coordinates.
    # Setting the key twice keeps only the last value, which would silently
    # drop the Delta artifact, so both coordinates go into a single entry.
    .config(
        "spark.jars.packages",
        "io.delta:delta-core_2.12:2.4.0,"
        "org.apache.hadoop:hadoop-aws:3.3.4",
    )
    # Register Delta's SQL extension and catalog, as required by Delta Lake.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    # save()/load() calls that do not name a format use this default source.
    # Spark's built-in default is Parquet; this notebook is meant to produce
    # Delta tables, so make Delta the default.
    .config("spark.sql.sources.default", "delta")
    .getOrCreate()
)


In [3]:
# Smoke test: build a 100-row DataFrame with a single `id` column and print
# the first 20 rows to confirm the session is usable.
spark.range(0, 100).show(20)

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+
only showing top 20 rows



In [4]:
# Print the Hadoop configuration value stored under "fs.s3a.impl".
# `_jsc` is a private SparkContext attribute (leading underscore), so this is a
# debugging aid rather than a supported API.
print(
    spark.sparkContext._jsc
    .hadoopConfiguration()
    .get("fs.s3a.impl")
)

org.apache.hadoop.fs.s3a.S3AFileSystem


In [5]:
# Target location of the table in MinIO, expressed as an s3a:// URI
# (bucket `test`, key prefix `delta-table`).
# NOTE(review): presumably the `test` bucket must already exist — confirm.
delta_table_path: str = "s3a://test/delta-table"

In [6]:
# Sample data: a single-column (`id`) DataFrame holding the integers 0 through 4.
data = spark.range(5)

In [7]:
# Persist `data` at delta_table_path, replacing whatever is already stored there.
# NOTE(review): no format is named here, so the on-disk format is decided by the
# session's default data source (`spark.sql.sources.default`) — confirm that it
# resolves to Delta, otherwise the message below is misleading.
data.write.save(delta_table_path, mode="overwrite")
print(f"Delta table written to {delta_table_path}")

Delta table written to s3a://test/delta-table


In [8]:
# Load the table stored at delta_table_path back from MinIO.
# NOTE(review): as with the write, no format is named, so the reader relies on
# the session's default data source — confirm that it resolves to Delta.
delta_df = spark.read.load(path=delta_table_path)

# Print the rows. Row order is not guaranteed (the captured output is unsorted);
# add an explicit orderBy("id") if a stable listing is needed.
delta_df.show()

+---+
| id|
+---+
|  0|
|  1|
|  3|
|  4|
|  2|
+---+

