In [2]:
from pyspark.sql import SparkSession
import pyspark
from pyspark.sql.functions import *
spark = pyspark.sql.SparkSession.builder.appName("Product_Price_Tracking") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

from delta.tables import *

In [3]:
df_productsaug20 = spark.read.csv('hdfs:///delta_lake/raw/products_aug20.csv', header=True, inferSchema=True)
df_productsaug20.show()

+---------+----------+-----+
|ProductID|      Date|Price|
+---------+----------+-----+
|      200|2020-08-20| 20.5|
|      210|2020-08-20| 45.0|
|      220|2020-08-20|34.56|
|      230|2020-08-20|23.67|
|      240|2020-08-20|89.76|
+---------+----------+-----+



In [4]:
df_productsaug20.write.format("delta").option("path", "hdfs:///delta_lake/products").saveAsTable("products")

In [5]:
spark.sql("SELECT * FROM products").show()

+---------+----------+-----+
|ProductID|      Date|Price|
+---------+----------+-----+
|      200|2020-08-20| 20.5|
|      210|2020-08-20| 45.0|
|      220|2020-08-20|34.56|
|      230|2020-08-20|23.67|
|      240|2020-08-20|89.76|
+---------+----------+-----+



In [None]:
spark.sql("CREATE TABLE products USING DELTA LOCATION 'hdfs:///delta_lake/products'")
spark.sql('SELECT * FROM products').show()

In [9]:
deltaTable = DeltaTable.forPath(spark, "hdfs:///delta_lake/products")

In [10]:
deltaTable.update("ProductID = '200'", { "Price": "'48.00'" } )

In [12]:
df = spark.read.format("delta").option("versionAsOf", 1).load("hdfs:///delta_lake/products")
df.show()

+---------+----------+-----+
|ProductID|      Date|Price|
+---------+----------+-----+
|      200|2020-08-20| 48.0|
|      210|2020-08-20| 45.0|
|      220|2020-08-20|34.56|
|      230|2020-08-20|23.67|
|      240|2020-08-20|89.76|
+---------+----------+-----+



In [13]:
df = spark.read.format("delta").option("versionAsOf", 0).load("hdfs:///delta_lake/products")
df.show()

+---------+----------+-----+
|ProductID|      Date|Price|
+---------+----------+-----+
|      200|2020-08-20| 20.5|
|      210|2020-08-20| 45.0|
|      220|2020-08-20|34.56|
|      230|2020-08-20|23.67|
|      240|2020-08-20|89.76|
+---------+----------+-----+



In [14]:
deltaTable.delete("ProductID = 210") 
df = spark.read.format("delta").option("versionAsOf", 2).load("hdfs:///delta_lake/products")
df.show()

+---------+----------+-----+
|ProductID|      Date|Price|
+---------+----------+-----+
|      200|2020-08-20| 48.0|
|      220|2020-08-20|34.56|
|      230|2020-08-20|23.67|
|      240|2020-08-20|89.76|
+---------+----------+-----+



In [17]:
df_productsaug21 = spark.read.csv('hdfs:///delta_lake/raw/products_aug21.csv', header=True, inferSchema=True)
df_productsaug21.show()

+---------+----------+-----+
|ProductID|      Date|Price|
+---------+----------+-----+
|      200|2020-08-21| 25.5|
|      210|2020-08-21| 46.0|
|      220|2020-08-21|34.56|
|      230|2020-08-21|23.67|
|      240|2020-08-21|90.82|
|      250|2020-08-21|89.76|
|      260|2020-08-21|54.55|
|      270|2020-08-21|96.32|
|      280|2020-08-21|44.78|
+---------+----------+-----+



In [18]:
deltaTable.alias("products").merge(
    df_productsaug21.alias("products_new"),
                    "products.ProductID = products_new.ProductID") \
                    .whenMatchedUpdate(set = { "Price" : "products_new.Price" } ) \
                    .whenNotMatchedInsert(values =
                       {
                        "ProductID": "products_new.ProductID",
                        "Date": "products_new.Date",
                        "Price": "products_new.Price"
                       }
                     ).execute()

In [19]:
df = spark.read.format("delta").option("versionAsOf", 3).load("hdfs:///delta_lake/products")
df.show()

+---------+----------+-----+
|ProductID|      Date|Price|
+---------+----------+-----+
|      250|2020-08-21|89.76|
|      270|2020-08-21|96.32|
|      280|2020-08-21|44.78|
|      200|2020-08-20| 25.5|
|      220|2020-08-20|34.56|
|      240|2020-08-20|90.82|
|      260|2020-08-21|54.55|
|      210|2020-08-21| 46.0|
|      230|2020-08-20|23.67|
+---------+----------+-----+



In [20]:
spark.table("products").show()

+---------+----------+-----+
|ProductID|      Date|Price|
+---------+----------+-----+
|      250|2020-08-21|89.76|
|      270|2020-08-21|96.32|
|      280|2020-08-21|44.78|
|      200|2020-08-20| 25.5|
|      220|2020-08-20|34.56|
|      240|2020-08-20|90.82|
|      260|2020-08-21|54.55|
|      210|2020-08-21| 46.0|
|      230|2020-08-20|23.67|
+---------+----------+-----+



In [21]:
deltaTable.update("ProductID = '230'", { "Price": "'33.67'" } )
deltaTable.update("ProductID = '210'", { "Price": "'56.00'" } )
deltaTable.update("ProductID = '250'", { "Price": "'99.76'" } )
deltaTable.update("ProductID = '220'", { "Price": "'44.56'" } )
deltaTable.update("ProductID = '240'", { "Price": "'100.82'" } )
deltaTable.update("ProductID = '200'", { "Price": "'35.5'" } )
deltaTable.update("ProductID = '260'", { "Price": "'64.55'" } )
deltaTable.update("ProductID = '280'", { "Price": "'54.78'" } )
deltaTable.update("ProductID = '270'", { "Price": "'106.32'" } )