In [None]:
from delta.tables import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, array, ArrayType, DateType, TimestampType, FloatType
from pyspark.sql.functions import *
from pyspark.sql.utils import AnalysisException

In [None]:

# Folder holding the store-orders CSV batch that the following cells inspect.
SALES_ORDERS_PATH = "Files/bronze/sales/store_orders/2024/04/15/17/"

# Explicit (column name, Spark type) pairs; order_date is read as a plain string here.
SALES_ORDERS_SCHEMA = [
    ('Op', StringType()),
    ('order_number', IntegerType()),
    ('customer_id', IntegerType()),
    ('product_id', IntegerType()),
    ('order_date', StringType()),
    ('units', IntegerType()),
    ('sale_price', FloatType()),
    ('currency', StringType()),
    ('order_mode', StringType()),
]

# Turn each pair into a StructField and wrap them in the StructType used by the reader.
fields = [StructField(column_name, column_type) for column_name, column_type in SALES_ORDERS_SCHEMA]
schema = StructType(fields)

# Read the headered CSV files with the explicit schema instead of inferring types.
df_read_data_incremental = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv(SALES_ORDERS_PATH)
)

display(df_read_data_incremental)
df_read_data_incremental.printSchema()

In [None]:
# Read the same folder again, this time letting Spark infer the column types.
# This replaces the DataFrame built with the explicit schema in the previous cell.
df_read_data_incremental = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(SALES_ORDERS_PATH)
)

display(df_read_data_incremental)
df_read_data_incremental.printSchema()

In [None]:
# Parse order_date (MM/dd/yyyy) into a date and updated_at (yyyy-MM-dd HH:mm:ss)
# into a timestamp, replacing both columns in place.
df_read_data_incremental = (
    df_read_data_incremental
    .withColumn("order_date", to_date(col("order_date"), 'MM/dd/yyyy'))
    .withColumn("updated_at", to_timestamp(col("updated_at"), 'yyyy-MM-dd HH:mm:ss'))
)
display(df_read_data_incremental)
df_read_data_incremental.printSchema()

In [None]:
def _read_store_orders_csv(path):
    """Read the headered CSV files under `path` with inferred types and parse order_date.

    order_date is converted from a 'MM/dd/yyyy' string into a date column; all
    other columns are returned as inferred by Spark.
    """
    orders = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(path)
    )
    return orders.withColumn("order_date", to_date(orders.order_date, 'MM/dd/yyyy'))


def merge_to_delta(SALES_ORDERS_PATH):
    """Load the CSV batch at SALES_ORDERS_PATH into the temp_store_orders Delta table.

    * If no Delta table exists at Tables/temp_store_orders, the batch is saved as a
      new table named temp_store_orders, partitioned by currency, with updated_at
      set to the current timestamp.
    * Otherwise the batch is merged into the table on order_number: matching rows
      are updated and unmatched rows are inserted, using updated_at parsed from
      the batch ('yyyy-MM-dd HH:mm:ss').

    Args:
        SALES_ORDERS_PATH: Path of the headered CSV file(s) to load.

    Returns:
        None. The loaded DataFrame is shown with display().

    Raises:
        Any error raised while reading, writing or merging propagates to the
        caller. Only the "table does not exist yet" case is handled here, so a
        failed merge is no longer misreported as a missing table.
    """
    table_path = "Tables/temp_store_orders"
    table_name = "temp_store_orders"
    source_alias = "store_orders_incremental"
    # Columns copied from the incoming batch on both update and insert.
    merged_columns = [
        "order_number",
        "customer_id",
        "product_id",
        "order_date",
        "units",
        "sale_price",
        "currency",
        "order_mode",
        "updated_at",
    ]

    # Explicit existence check instead of a bare `except:` around the whole merge,
    # which used to swallow genuine read/merge failures.
    if not DeltaTable.isDeltaTable(spark, table_path):
        print("Delta table does not exist")
        df_read_data_full = _read_store_orders_csv(SALES_ORDERS_PATH).withColumn(
            "updated_at", current_timestamp()
        )
        (
            df_read_data_full.write
            .format("delta")
            .partitionBy("currency")
            .saveAsTable(table_name)
        )
        display(df_read_data_full)
        return

    print("Delta table exists")
    df_read_data_incremental = _read_store_orders_csv(SALES_ORDERS_PATH)
    df_read_data_incremental = df_read_data_incremental.withColumn(
        "updated_at",
        to_timestamp(df_read_data_incremental.updated_at, 'yyyy-MM-dd HH:mm:ss'),
    )
    display(df_read_data_incremental)

    # The same target-column -> source-expression mapping serves update and insert.
    column_mapping = {name: f"{source_alias}.{name}" for name in merged_columns}
    (
        DeltaTable.forPath(spark, table_path)
        .alias("store_orders")
        .merge(
            df_read_data_incremental.alias(source_alias),
            "store_orders.order_number = store_orders_incremental.order_number",
        )
        .whenMatchedUpdate(set=column_mapping)
        .whenNotMatchedInsert(values=column_mapping)
        .execute()
    )

In [None]:
# Load the batch at SALES_ORDERS_PATH (set in the first read cell above).
# NOTE(review): on a lakehouse without temp_store_orders this presumably takes the
# "create table" branch of merge_to_delta — confirm from the printed message.
merge_to_delta(SALES_ORDERS_PATH)

In [None]:
%%sql		
-- Preview every row of temp_store_orders after the load.
SELECT * FROM temp_store_orders;

In [None]:
%%sql
-- Show the columns and data types of temp_store_orders.
DESCRIBE temp_store_orders;

In [None]:
%%sql
-- List the operations recorded in the table's history so far.
DESCRIBE HISTORY temp_store_orders;

In [None]:
%%sql
-- Look up order 5 before the following cells modify it.
SELECT * FROM temp_store_orders WHERE order_number=5; 

In [None]:
%%sql
-- Set the sale price of order 5 to 90.50.
UPDATE temp_store_orders SET sale_price=90.50 WHERE order_number=5;

In [None]:
%%sql
-- Re-read order 5 to check the sale_price written by the UPDATE.
SELECT * FROM temp_store_orders WHERE order_number=5;


In [None]:
%%sql
-- Inspect the table history again after the UPDATE.
DESCRIBE HISTORY temp_store_orders;


In [None]:
%%sql
-- Time travel: read order 5 as it was stored in table version 0.
SELECT * FROM temp_store_orders VERSION AS OF 0 WHERE order_number=5;

In [None]:
%%sql
-- Delete order 5 from the table ...
DELETE FROM temp_store_orders WHERE order_number=5;
-- ... then query it again to check the current state of the table.
SELECT * FROM temp_store_orders WHERE order_number=5;


In [None]:
%%sql
-- Inspect the table history again after the DELETE.
DESCRIBE HISTORY temp_store_orders;

In [None]:
%%sql
-- Roll the table back to version 1.
-- NOTE(review): presumably version 1 is the state after the UPDATE and before the
-- DELETE above — confirm against the DESCRIBE HISTORY output.
RESTORE TABLE temp_store_orders TO VERSION AS OF 1;
-- Query order 5 again after the restore.
SELECT * FROM temp_store_orders WHERE order_number=5; 

In [None]:
%%sql
-- Inspect the table history again after the RESTORE.
DESCRIBE HISTORY temp_store_orders;


In [None]:
%%sql
-- Row count before the next batch is merged.
SELECT count(*) FROM temp_store_orders;

In [None]:
# Point SALES_ORDERS_PATH at the .../19/ folder and load that batch.
# Note: this overwrites the module-level SALES_ORDERS_PATH set earlier.
SALES_ORDERS_PATH="Files/bronze/sales/store_orders/2024/04/15/19/"
merge_to_delta(SALES_ORDERS_PATH)


In [None]:
%%sql
-- Row count after loading the .../19/ batch.
SELECT count(*) FROM temp_store_orders;
-- Spot-check a fixed set of order numbers.
SELECT * FROM temp_store_orders WHERE order_number IN (500, 1254, 1501, 2234, 2345);


In [None]:
%%sql
-- Inspect the table history after loading the .../19/ batch.
DESCRIBE HISTORY temp_store_orders;


In [None]:
# Point SALES_ORDERS_PATH at the .../20/ folder and load that batch.
# Note: this overwrites the module-level SALES_ORDERS_PATH again.
SALES_ORDERS_PATH="Files/bronze/sales/store_orders/2024/04/15/20/"
merge_to_delta(SALES_ORDERS_PATH)


In [None]:
%%sql
-- Spot-check the same order numbers after loading the .../20/ batch.
SELECT * FROM temp_store_orders WHERE order_number IN (500, 1254, 1501, 2234, 2345);

In [None]:
%%sql
-- Inspect the table history after loading the .../20/ batch.
DESCRIBE HISTORY temp_store_orders;

In [None]:
# CSV file used for the schema-change appends in the cells below.
FILE_PATH_WITH_NEW_SCHEMA = "Files/schema_change.csv"

# Read the headered CSV with inferred types and parse order_date (MM/dd/yyyy) as a date.
# NOTE(review): the updated_at -> timestamp parsing applied to the other batches was
# disabled for this file — presumably its updated_at column differs or is absent; confirm.
df_read_data_schema_change = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(FILE_PATH_WITH_NEW_SCHEMA)
    .withColumn("order_date", to_date(col("order_date"), 'MM/dd/yyyy'))
)
display(df_read_data_schema_change)
df_read_data_schema_change.printSchema()

In [None]:
# Handle to the Delta table stored at Tables/temp_store_orders.
deltaTable = DeltaTable.forPath(spark, "Tables/temp_store_orders")

In [None]:
# Append the schema-change file WITHOUT mergeSchema.
# NOTE(review): presumably this write is expected to be rejected because the file's
# schema differs from the table's — confirm. The error is caught and shown so that
# "Run All" can continue to the mergeSchema cell instead of stopping here.
try:
    (
        df_read_data_schema_change.write
        .format("delta")
        .mode("append")
        .save("Tables/temp_store_orders")
    )
except AnalysisException as append_error:
    print(f"Append without mergeSchema was rejected:\n{append_error}")
else:
    print("Append without mergeSchema succeeded.")

In [None]:
# Append the schema-change file with the mergeSchema write option enabled.
(
    df_read_data_schema_change.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .save("Tables/temp_store_orders")
)


In [None]:
%%sql
-- Preview every row of the table after the schema-change append.
SELECT * FROM temp_store_orders;

In [None]:
%%sql
-- Remove the temp_store_orders table.
DROP TABLE temp_store_orders;