In [5]:
import os

import findspark
findspark.init()

from pyspark.sql import SparkSession

# Initialise Spark with the Iceberg configuration.
# The MinIO credentials are read from the MINIO_ACCESS_KEY / MINIO_SECRET_KEY
# environment variables so real secrets never have to be committed with the
# notebook; the fallbacks are the local development defaults used so far.
spark = (SparkSession.builder
    .appName("Load-parquet-to-Iceberg")
    .master("spark://spark-nb:7077")
    # 1. Enable the Iceberg SQL extensions
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # 2. Configure a catalog named 'iceberg' (Hive metastore backed)
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hive")
    .config("spark.sql.catalog.iceberg.uri", "thrift://hive-metastore:9083")
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://iceberg/lakehouse")
    .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.hadoop.HadoopFileIO")
    # 3. Configure S3 access (MinIO)
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", os.environ.get("MINIO_ACCESS_KEY", "minio"))
    .config("spark.hadoop.fs.s3a.secret.key", os.environ.get("MINIO_SECRET_KEY", "minio123"))
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.driver.host", "spark-nb")
    .getOrCreate())

In [None]:
!curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet -o /tmp/yellow_tripdata_2023-01.parquet

In [6]:
# Load the downloaded trip file into a DataFrame through the generic reader API.
df = spark.read.format("parquet").load("/tmp/yellow_tripdata_2023-01.parquet")

# Inspect the inferred schema, then preview the first five rows.
df.printSchema()
df.show(n=5)

                                                                                

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



                                                                                

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2023-01-01 00:32:10|  2023-01-01 00:40:36|            1.0|         0.97|       1.0|                 N|         161|         141|           2|        9.3|  1.0|    0.5|       0.

In [7]:
# Create the 'bronze' namespace explicitly in the 'iceberg' catalog.
# An unqualified name is resolved against the session's current catalog,
# which is not necessarily the catalog the table is written to below.
spark.sql("CREATE NAMESPACE IF NOT EXISTS iceberg.bronze")

DataFrame[]

In [8]:
# Verify: list the namespaces of the 'iceberg' catalog, i.e. the catalog
# that the table is created in, rather than the session's current catalog.
spark.sql("SHOW NAMESPACES IN iceberg").show()

+---------+
|namespace|
+---------+
|   bronze|
|  default|
+---------+



In [11]:
# Write the DataFrame as an Iceberg table via writeTo, setting the table's
# 'location' property explicitly and Parquet as the default write format.
(
    df.writeTo("iceberg.bronze.ny_taxi_2023")
    .tableProperty("write.format.default", "parquet")
    .tableProperty("location", "s3a://iceberg/lakehouse/bronze/ny_taxi_2023")
    .createOrReplace()
)

# Only reached when createOrReplace() returned without raising.
print("✅ Berhasil! Data sekarang tersimpan di MinIO sebagai tabel Iceberg.")

25/12/23 02:32:55 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


✅ Berhasil! Data sekarang tersimpan di MinIO sebagai tabel Iceberg.


In [12]:
# Stop the SparkSession created in the first cell now that the load is done.
spark.stop()