In [1]:
from pyspark.sql import SparkSession

# Spark settings that make the built-in session catalog Iceberg-aware.
spark_conf = {
    # Maven coordinates of the Iceberg runtime built for Spark 3.5 / Scala 2.12.
    "spark.jars.packages": "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1",
    # Replace the implementation behind `spark_catalog` with Iceberg's
    # session-catalog wrapper.
    "spark.sql.catalog.spark_catalog": "org.apache.iceberg.spark.SparkSessionCatalog",
    # The Iceberg catalog option is `type` (singular). The previous key,
    # `...spark_catalog.types`, is not an option Iceberg reads, so the Hive
    # backend was only selected through Iceberg's default. State it explicitly.
    "spark.sql.catalog.spark_catalog.type": "hive",
}

# Build (or reuse) the session against the standalone master with Hive support
# enabled; every entry of `spark_conf` is passed in one call via `map=`.
spark = (
    SparkSession.builder
    .appName("spark-nb")
    .master("spark://spark-master:7077")
    .config(map=spark_conf)
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# Read every Parquet file found under the raw-data bucket into one DataFrame.
df = (
    spark.read
    .format("parquet")
    .load("s3a://raw-data/")
)

In [3]:
# Print the column names, types and nullability Spark inferred from the Parquet files.
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- lpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- trip_type: long (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [4]:
# Materialise the raw data as an Iceberg table.
# createOrReplace() keeps this cell re-runnable: a plain create() raises a
# table-already-exists error as soon as the table is present in the catalog,
# which breaks Restart & Run All. Replacing also restores the full schema
# after the DROP COLUMN step later in this notebook.
df.writeTo("local_db.sample_iceberg_table").using("iceberg").createOrReplace()

In [5]:
# SHOW CREATE TABLE returns the DDL in the `createtab_stmt` column of its
# first row; print it so the line breaks render.
ddl_rows = spark.sql("SHOW CREATE TABLE local_db.sample_iceberg_table").collect()
print(ddl_rows[0]["createtab_stmt"])

CREATE TABLE spark_catalog.local_db.sample_iceberg_table (
  VendorID INT,
  lpep_pickup_datetime TIMESTAMP_NTZ,
  lpep_dropoff_datetime TIMESTAMP_NTZ,
  store_and_fwd_flag STRING,
  RatecodeID BIGINT,
  PULocationID INT,
  DOLocationID INT,
  passenger_count BIGINT,
  trip_distance DOUBLE,
  fare_amount DOUBLE,
  extra DOUBLE,
  mta_tax DOUBLE,
  tip_amount DOUBLE,
  tolls_amount DOUBLE,
  ehail_fee DOUBLE,
  improvement_surcharge DOUBLE,
  total_amount DOUBLE,
  payment_type BIGINT,
  trip_type BIGINT,
  congestion_surcharge DOUBLE)
USING iceberg
LOCATION 's3a://spark-warehouse/hive/local_db/sample_iceberg_table'
TBLPROPERTIES (
  'current-snapshot-id' = '567489935029930244',
  'format' = 'iceberg/parquet',
  'format-version' = '2',
  'write.parquet.compression-codec' = 'zstd')



In [6]:
# Pull a five-row sample to the driver and show it as a pandas DataFrame.
preview_query = "SELECT * FROM local_db.sample_iceberg_table LIMIT 5"
spark.sql(preview_query).toPandas()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2024-03-01 00:10:52,2024-03-01 00:26:12,N,1,129,226,1,1.72,12.8,1.0,0.5,3.06,0.0,,1.0,18.36,1,1,0.0
1,2,2024-03-01 00:22:21,2024-03-01 00:35:15,N,1,130,218,1,3.25,17.7,1.0,0.5,0.0,0.0,,1.0,20.2,2,1,0.0
2,2,2024-03-01 00:45:27,2024-03-01 01:04:32,N,1,255,107,2,4.58,23.3,1.0,0.5,3.5,0.0,,1.0,32.05,1,1,2.75
3,1,2024-03-01 00:02:00,2024-03-01 00:23:45,N,1,181,71,1,0.0,22.5,0.0,1.5,0.0,0.0,,1.0,24.0,1,1,0.0
4,2,2024-03-01 00:16:45,2024-03-01 00:23:25,N,1,95,135,1,1.15,8.6,1.0,0.5,1.0,0.0,,1.0,12.1,1,1,0.0


In [7]:
# Schema-evolution step: remove the VendorID column from the table definition.
drop_vendor_sql = "ALTER TABLE local_db.sample_iceberg_table DROP COLUMN VendorID"
spark.sql(drop_vendor_sql)

DataFrame[]

In [8]:
# Print the table DDL again to confirm that VendorID is no longer part of the schema.
ddl_rows_after_drop = spark.sql("SHOW CREATE TABLE local_db.sample_iceberg_table").collect()
print(ddl_rows_after_drop[0]["createtab_stmt"])

CREATE TABLE spark_catalog.local_db.sample_iceberg_table (
  lpep_pickup_datetime TIMESTAMP_NTZ,
  lpep_dropoff_datetime TIMESTAMP_NTZ,
  store_and_fwd_flag STRING,
  RatecodeID BIGINT,
  PULocationID INT,
  DOLocationID INT,
  passenger_count BIGINT,
  trip_distance DOUBLE,
  fare_amount DOUBLE,
  extra DOUBLE,
  mta_tax DOUBLE,
  tip_amount DOUBLE,
  tolls_amount DOUBLE,
  ehail_fee DOUBLE,
  improvement_surcharge DOUBLE,
  total_amount DOUBLE,
  payment_type BIGINT,
  trip_type BIGINT,
  congestion_surcharge DOUBLE)
USING iceberg
LOCATION 's3a://spark-warehouse/hive/local_db/sample_iceberg_table'
TBLPROPERTIES (
  'current-snapshot-id' = '567489935029930244',
  'format' = 'iceberg/parquet',
  'format-version' = '2',
  'write.parquet.compression-codec' = 'zstd')



In [9]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()