In [None]:
"""
Author: Matt Martin
Date: 2023-10-20
Desc: Delta Lake template in a Jupyter notebook, for the Docker image to copy when loading for the end user
"""

In [None]:
from datetime import datetime
import pyspark
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from delta import *



# Delta's SQL commands used later in this notebook (UPDATE / DELETE / MERGE and
# CREATE OR REPLACE TABLE ... USING delta) require the Delta session extension and
# the Delta catalog. configure_spark_with_delta_pip() only adds the Delta jars to
# the builder, so both settings are applied explicitly here rather than relying on
# the environment's Spark defaults.
builder = (
    pyspark.sql.SparkSession.builder.appName("delta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
def drop_table_if_exists(tbl_path) -> None:
    """Empty the Delta table stored at ``tbl_path``, if there is one.

    NOTE: despite the name, this issues an unconditional Delta ``delete()``,
    i.e. it removes every row from the table. It does not remove the table's
    directory or files.

    Args:
        tbl_path: Filesystem path of the Delta table.

    Returns:
        None. Does nothing when ``tbl_path`` is not a Delta table.

    Raises:
        Any error raised by Delta/Spark while deleting from an existing table.
        Previously every exception was silently swallowed, which also hid
        genuine failures (permissions, concurrent writes, a missing session).
    """
    from delta import DeltaTable

    # Check explicitly for the "nothing to drop" case instead of catching
    # every exception, so that real failures are not masked.
    if not DeltaTable.isDeltaTable(spark, tbl_path):
        return

    # delete() with no condition removes all rows from the table.
    DeltaTable.forPath(spark, tbl_path).delete()

In [None]:
def create_or_replace_delta_table(df, tbl_path) -> None:
    """Write ``df`` to ``tbl_path`` in Delta format, overwriting existing data.

    Args:
        df: DataFrame to persist; only its ``write`` interface is used.
        tbl_path: Destination path of the Delta table.

    Returns:
        None.

    Raises:
        Whatever the writer raises. The write is attempted exactly once: the
        earlier fallback retried the same save without ``mode("overwrite")``,
        which could only replace the original error with a different one.
    """
    df.write.format("delta").mode("overwrite").save(tbl_path)

In [None]:
def build_src_table(tbl_path: str = "/home/jovyan/src_ppl") -> None:
    """Build the sample people DataFrame and save it as a Delta table.

    The table has three nullable columns: ``name`` (string), ``age`` (integer)
    and ``hire_date`` (date), populated with four hard-coded sample rows.

    Args:
        tbl_path: Destination path of the Delta table. Defaults to the path
            this function previously hard-coded, so calling it with no
            arguments behaves as before.

    Returns:
        None. The table is written via ``create_or_replace_delta_table``.
    """
    schema = StructType([
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField("hire_date", DateType(), True)
    ])

    data = [
        ("Matt", 20, datetime(2022,8,19)),
        ("Bill", 35, datetime(2023,4,15)),
        ("Nancy", 57, datetime(2022,4,23)),
        ("Rachel", 19, datetime(2021,6,7)),
    ]

    df = spark.createDataFrame(data, schema=schema)
    create_or_replace_delta_table(df, tbl_path)

In [None]:
# Build the source Delta Lake table by running build_src_table().
build_src_table()

In [None]:
# Load the Delta table back from disk and print its rows.
delta_reader = spark.read.format("delta")
src_df = delta_reader.load("/home/jovyan/src_ppl")
src_df.show()

In [None]:
# Let's do some Spark SQL: expose the DataFrame as the temp view "persons".
# createOrReplaceTempView (rather than createTempView) keeps this cell re-runnable:
# createTempView raises if the view already exists in the current session.
src_df.createOrReplaceTempView("persons")
spark.sql("SELECT * FROM persons").show()

In [None]:
# IF NOT EXISTS makes the cell safe to re-run; a plain CREATE SCHEMA fails once
# the schema is already present.
spark.sql("CREATE SCHEMA IF NOT EXISTS test")

In [None]:
# Update test: set Rachel's age to 20 through the "persons" view.
update_stmt = "UPDATE persons set age = 20 where name = 'Rachel'"
spark.sql(update_stmt)

In [None]:
# Copy test: create (or replace) the Delta table persons2 from the rows of
# persons whose age is between 20 and 38.
copy_stmt = "CREATE OR REPLACE TABLE persons2 USING delta location '/home/jovyan/ppl2' AS SELECT * FROM persons WHERE age BETWEEN 20 and 38"
spark.sql(copy_stmt)

In [None]:
# Delete test: remove Rachel's row from persons2.
delete_stmt = "DELETE FROM persons2 WHERE name = 'Rachel'"
spark.sql(delete_stmt)

In [None]:
# Insert test: append two new rows (Greg and Adam) to persons2.
insert_stmt = """
INSERT INTO persons2 (name, age, hire_date)
VALUES 
     ('Greg',42,'2023-01-01')
    ,('Adam',31,'2023-08-05')
"""
spark.sql(insert_stmt)

In [None]:
# Merge test: upsert persons2 into persons, matching rows on name.
# Matched rows get age and hire_date from persons2; unmatched rows are inserted.
merge_stmt = """
MERGE INTO persons as TGT
    USING persons2 AS SRC
        ON TGT.name = SRC.name
    WHEN MATCHED THEN UPDATE
        SET TGT.age = SRC.AGE, TGT.hire_date = SRC.hire_date
    WHEN NOT MATCHED THEN 
        INSERT (name, age, hire_date)
        VALUES (SRC.name, SRC.age, SRC.hire_date)
"""
spark.sql(merge_stmt)

In [None]:
# Show the contents of persons after the merge.
persons_after_merge = spark.sql("SELECT * FROM persons")
persons_after_merge.show()