# Imports 

In [23]:
import os

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Setup

In [24]:
# Session settings: Delta Lake SQL extension and catalog, plus the S3A
# filesystem implementation used for the s3a:// paths in this notebook.
session_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
}

session_builder = SparkSession.builder.appName("pyspark-notebook")
session_builder = session_builder.master("spark://spark-master:7077")
for setting_name, setting_value in session_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

In [25]:
# Point the S3A connector at the object store.
#
# Each value can be overridden through an environment variable so that real
# credentials and the endpoint do not have to be edited into (and committed
# with) the notebook. The fallbacks are the original local-development
# values, so behaviour is unchanged when no variable is set.
# NOTE(review): the fallback endpoint is a container IP address; set
# S3A_ENDPOINT if the object store is reachable at a different address.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", os.environ.get("S3A_ACCESS_KEY", "minio"))
hadoop_conf.set("fs.s3a.secret.key", os.environ.get("S3A_SECRET_KEY", "minio123"))
hadoop_conf.set("fs.s3a.endpoint", os.environ.get("S3A_ENDPOINT", "http://172.18.0.3:9000"))

In [26]:
# NOTE(review): wildcard import placed after the setup cells. The cells visible
# here only use `DeltaTable` from this module; consider importing it explicitly
# in the imports cell once the rest of the notebook has been checked.
from delta.tables import *

# Create initial dataframe

In [27]:
# Load every CSV object under the "incoming" bucket, treating the first line
# of each file as the column header. No schema or inferSchema is supplied here.
df = spark.read.csv("s3a://incoming/", header=True)

In [28]:
# Tag each row with the path of the file it was read from (lineage column).
source_file_col = F.input_file_name()
df = df.withColumn("source_filename", source_file_col)

In [29]:
# Preview a single record, one field per line, without truncating values.
df.show(n=1, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------
 vendor_id           | 1                                                 
 pickup_datetime     | 2019-01-15 03:36:12                               
 dropoff_datetime    | 2019-01-15 03:42:19                               
 passenger_count     | 1                                                 
 pickup_location_id  | 230                                               
 dropoff_location_id | 48                                                
 fare_amount         | 6.5                                               
 source_filename     | s3a://incoming/yellow_tripdata_sample_2019_01.csv 
only showing top 1 row



In [31]:
# Persist the DataFrame as a Delta table at s3a://lake/taxis.
# "errorifexists" is Spark's default save mode, spelled out here: the write
# fails if the target already exists.
# NOTE(review): delta.checkpointInterval=1 presumably requests a checkpoint on
# every commit; confirm this is intended outside of a demo.
delta_writer = (
    df.write
    .format("delta")
    .mode("errorifexists")
    .option("delta.checkpointInterval", 1)
)
delta_writer.save("s3a://lake/taxis")

                                                                                

# Check table history

In [11]:
# Handle to the Delta table stored at s3a://lake/taxis.
dl = DeltaTable.forPath(sparkSession=spark, path="s3a://lake/taxis")

In [12]:
# Print the table's commit history, one field per line and untruncated.
table_history = dl.history()
table_history.show(truncate=False, vertical=True)

-RECORD 0---------------------------------------------------------------------------
 version             | 0                                                            
 timestamp           | 2023-08-13 19:01:14                                          
 userId              | null                                                         
 userName            | null                                                         
 operation           | WRITE                                                        
 operationParameters | {mode -> ErrorIfExists, partitionBy -> []}                   
 job                 | null                                                         
 notebook            | null                                                         
 clusterId           | null                                                         
 readVersion         | null                                                         
 isolationLevel      | Serializable                              

# Check data 

In [13]:
# Read the Delta table back from the lake. This rebinds `df`, replacing the
# CSV-backed DataFrame created earlier in the notebook.
df = spark.read.load("s3a://lake/taxis", format="delta")

In [14]:
# Display the first 20 rows with long values truncated (show()'s defaults).
df.show(n=20, truncate=True)

                                                                                

+---------+-------------------+-------------------+---------------+------------------+-------------------+-----------+--------------------+
|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|pickup_location_id|dropoff_location_id|fare_amount|     source_filename|
+---------+-------------------+-------------------+---------------+------------------+-------------------+-----------+--------------------+
|        1|2019-01-15 03:36:12|2019-01-15 03:42:19|              1|               230|                 48|        6.5|s3a://incoming/ye...|
|        1|2019-01-25 18:20:32|2019-01-25 18:26:55|              1|               112|                112|        6.0|s3a://incoming/ye...|
|        1|2019-01-05 06:47:31|2019-01-05 06:52:19|              1|               107|                  4|        6.0|s3a://incoming/ye...|
|        1|2019-01-09 15:08:02|2019-01-09 15:20:17|              1|               143|                158|       11.0|s3a://incoming/ye...|
|        1|2019-01-2