# Apache Iceberg Quickstart

### Running through examples from [quickstart](https://iceberg.apache.org/spark-quickstart/)

## Initialize Spark Session

In [1]:
from pyspark.sql import SparkSession

from pyspark.sql.types import DoubleType, FloatType, LongType, StructType,StructField, StringType
# Obtain the notebook's SparkSession: getOrCreate() hands back an already
# running session when there is one, otherwise it builds a new one.
spark = (
    SparkSession.builder
    .appName("icerberg_qs")
    .getOrCreate()
)


24/01/12 22:02:15 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


## Define an empty DataFrame

In [2]:
# Column layout for the taxi trips table. Every column is nullable;
# trip_distance is a single-precision float while fare_amount is a double.
schema = (
    StructType()
    .add("vendor_id", LongType(), True)
    .add("trip_id", LongType(), True)
    .add("trip_distance", FloatType(), True)
    .add("fare_amount", DoubleType(), True)
    .add("store_and_fwd_flag", StringType(), True)
)

# No rows yet: the DataFrame only carries the schema.
df = spark.createDataFrame(data=[], schema=schema)
df

DataFrame[vendor_id: bigint, trip_id: bigint, trip_distance: float, fare_amount: double, store_and_fwd_flag: string]

### Create empty table with schema

In [3]:
# Make sure the target namespace exists; IF NOT EXISTS keeps this step safe to re-run.
spark.sql('CREATE DATABASE IF NOT EXISTS lakehouse.nyc;')

# Use createOrReplace() rather than create(): create() raises once the table
# already exists, so this cell would fail on a second run even though the
# database step above is guarded. Replacing resets the table to df's schema
# (df is the empty DataFrame at this point), so a full re-run starts clean.
df.writeTo("lakehouse.nyc.taxis").createOrReplace()

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".    (0 + 1) / 1]
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
                                                                                

### Confirm table was created

In [4]:
# Qualify the namespace with its catalog, matching the "lakehouse.nyc" names
# used by the other cells, so the lookup does not depend on which catalog
# happens to be the session's current one.
# NOTE(review): a catalog-qualified dbName needs Spark 3.4+; the Table(...)
# output recorded for this cell suggests that version is in use. Confirm.
spark.catalog.listTables('lakehouse.nyc')

[Table(name='taxis', catalog='lakehouse', namespace=['nyc'], description=None, tableType='MANAGED', isTemporary=False)]

### Write data to existing table

In [5]:
# Sample trips, one tuple per row, in the table's column order:
# (vendor_id, trip_id, trip_distance, fare_amount, store_and_fwd_flag)
data = [
    (1, 1000371, 1.8, 15.32, "N"),
    (2, 1000372, 2.5, 22.15, "N"),
    (2, 1000373, 0.9, 9.01, "N"),
    (1, 1000374, 8.4, 42.13, "Y"),
]

# Reuse the schema of the existing table instead of redefining it here.
schema = spark.table("lakehouse.nyc.taxis").schema

# Build the rows into a DataFrame and append them to the table.
df = spark.createDataFrame(data=data, schema=schema)
df.writeTo("lakehouse.nyc.taxis").append()

                                                                                

### Confirm data was added

In [6]:
# Read the table back and print its rows. show() only prints and returns
# None, so it must not be part of the assignment: keep df bound to the
# DataFrame and call show() on it separately.
df = spark.table("lakehouse.nyc.taxis")
df.show()

[Stage 4:>                                                          (0 + 1) / 1]

+---------+-------+-------------+-----------+------------------+
|vendor_id|trip_id|trip_distance|fare_amount|store_and_fwd_flag|
+---------+-------+-------------+-----------+------------------+
|        1|1000371|          1.8|      15.32|                 N|
|        2|1000372|          2.5|      22.15|                 N|
|        2|1000373|          0.9|       9.01|                 N|
|        1|1000374|          8.4|      42.13|                 Y|
+---------+-------+-------------+-----------+------------------+



                                                                                

In [7]:
# Stop the Spark session now that the walkthrough is finished.
spark.stop()