In [None]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

import ConnectionConfig as cc
cc.setupEnvironment()
spark = cc.startLocalCluster("DeltaTableEx")
spark.getActiveSession()

In [3]:
from delta import DeltaTable

# Step 1: Load the dataset into a Delta table
transaction_data_path = "./FileStore/tables/transactions.csv"  # Replace with the actual path to the transaction data CSV file
transaction_delta_path = "./spark-warehouse/transaction_data_delta"  # Replace with the actual path where you want to store the Delta table

# Persist a detlatable to disk based on the CSV transactions.csv
Read the CSV file transaction_data_path
Write it as a Delta table to transaction_delta_path
Use inferSchema option (see: https://sparkbyexamples.com/pyspark/pyspark-read-csv-file-into-dataframe/) to automatically infer the schema from the CSV file. Otherwise all columns will be of type string

In [4]:
from pyspark.sql.types import StructType
# Read the CSV data and write it as a Delta table
df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(transaction_data_path)
df.write.format("delta").mode("overwrite").save(transaction_delta_path)

# Create a DeltaTable object for the persisted transaction data
Use DetlaTable.forPath() to create a DeltaTable object for the persisted transaction data
A delta table object is needed to use functions like history() and vacuum()


In [5]:
delta_table = DeltaTable.forPath(spark, transaction_delta_path)

# Get the schema and history information for the Delta table
To get the schema get the dataframe object with toDf() and use printSchema()
Delta table has a history() method that returns a dataframe with the history of the delta table
Make sure you understand the results

In [8]:
# Get the # Ge schema and history
delta_table.toDF().printSchema()
delta_table.detail().show()
delta_table.history().show(vertical=True, truncate=False)

root
 |-- transaction_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- purchase_date: date (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: integer (nullable = true)

+------+--------------------+----+-----------+--------------------+--------------------+--------------------+----------------+-----------------+--------+-----------+----------+----------------+----------------+--------------------+
|format|                  id|name|description|            location|           createdAt|        lastModified|partitionColumns|clusteringColumns|numFiles|sizeInBytes|properties|minReaderVersion|minWriterVersion|       tableFeatures|
+------+--------------------+----+-----------+--------------------+--------------------+--------------------+----------------+-----------------+--------+-----------+----------+----------------+----------------+--------------------+
| delta|128b3e62-9542-475...|NULL|       NULL|file:/C:/DevProje...|2024-09-13 1

# Create a query on the delta table to find the total number of transactions
Use toDF() to convert the delta table to a spark dataframe

In [9]:
total_transactions = delta_table.toDF().count()
print(f"Total number of transactions: {total_transactions}")


Total number of transactions: 10


# Create a view on the delta table with the name 'transactions'

In [10]:
#Create a view on the delta table
delta_table.toDF().createOrReplaceTempView("transactions")

# Update the delta table to increase the quantity of a specific product by a given value
You canw write an update statement on the delta table with spark.sql(). This is not supported without the use of delta table

In [11]:
# Step 5: Update the Delta table to increase the quantity of a specific product by a given value spark.sql
spark.sql("UPDATE transactions SET quantity = quantity + 1 WHERE product_name = 'Product A'").show()

+-----------------+
|num_affected_rows|
+-----------------+
|                4|
+-----------------+


# Consult the history of the Delta table again
Try to understand what you see
When the results are truncated, you can use the vertical option in method show() to see the full results

In [12]:
delta_table.history().show(truncate = False, vertical=True)


-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 version             | 1                                                                                                                                                                                                                                                                                                                              
 timestamp           | 2024-09-13 14:54:50.321                                                                                                                                                                                                                                                                            

# Delete all transactions from customer 101
You canw write an delete statement on the delta table with spark.sql(). This is not supported without the use of delta table

In [13]:
#Delete all transactions from customer 101
spark.sql("DELETE from transactions where customer_id=101").show()

+-----------------+
|num_affected_rows|
+-----------------+
|                3|
+-----------------+


# Consult the history of the Delta table again to see what is changed

In [14]:
spark.sql("select * from transactions").show()
delta_table.history().show(truncate=False,vertical=True)

+--------------+-----------+-------------+------------+--------+
|transaction_id|customer_id|purchase_date|product_name|quantity|
+--------------+-----------+-------------+------------+--------+
|             2|        102|   2023-06-02|   Product B|       5|
|             3|        103|   2023-06-03|   Product A|       4|
|             5|        104|   2023-06-05|   Product B|       4|
|             6|        102|   2023-06-06|   Product A|       3|
|             7|        103|   2023-06-07|   Product C|       3|
|             9|        104|   2023-06-09|   Product A|       3|
|            10|        103|   2023-06-10|   Product B|       3|
+--------------+-----------+-------------+------------+--------+
-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Perform an merge operation
Get info on merge operations: https://docs.delta.io/latest/delta-update.html#upsert-into-a-table-using-merge&language-sql
1. Create a new dataframe original_df that reads the original transaction data from the CSV file and create a temporary view 'original_transactions'
2. Perform a merge of  'original_transactions' into 'transactions'
3. When a match is found, update the quantity for that row to 0
4. When no match is found, insert the row from the new dataframe

In [15]:
# Step 6: Perform an upsert
original_df =spark.read.format("csv").options(header="true", inferSchema="true").load(transaction_data_path)
original_df.createOrReplaceTempView("original_transactions")

spark.sql("MERGE INTO transactions AS target \
          using original_transactions AS source ON target.transaction_id = source.transaction_id \
          WHEN MATCHED THEN UPDATE SET quantity = 0 \
          WHEN NOT MATCHED THEN INSERT *")

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

# Show the detla table and detla table history
Is the result what you expected?

In [17]:

delta_table.toDF().show()
delta_table.history().show(vertical=True, truncate=False)


+--------------+-----------+-------------+------------+--------+
|transaction_id|customer_id|purchase_date|product_name|quantity|
+--------------+-----------+-------------+------------+--------+
|             1|        101|   2023-06-01|   Product A|       2|
|             2|        102|   2023-06-02|   Product B|       0|
|             3|        103|   2023-06-03|   Product A|       0|
|             4|        101|   2023-06-04|   Product C|       1|
|             5|        104|   2023-06-05|   Product B|       0|
|             6|        102|   2023-06-06|   Product A|       0|
|             7|        103|   2023-06-07|   Product C|       0|
|             8|        101|   2023-06-08|   Product B|       5|
|             9|        104|   2023-06-09|   Product A|       0|
|            10|        103|   2023-06-10|   Product B|       0|
+--------------+-----------+-------------+------------+--------+
-RECORD 0---------------------------------------------------------------------------------

# Perform a select on the delta table on the second version
Use option versionAsOf when reading (spark.read) the delta table from disk
You can also travel back in time with option timestampAsOf

In [20]:

df = spark.read.format("delta").option("versionAsOf", 1).load("./spark-warehouse/transaction_data_delta")
df.show()

+--------------+-----------+-------------+------------+--------+
|transaction_id|customer_id|purchase_date|product_name|quantity|
+--------------+-----------+-------------+------------+--------+
|             1|        101|   2023-06-01|   Product A|       3|
|             2|        102|   2023-06-02|   Product B|       5|
|             3|        103|   2023-06-03|   Product A|       4|
|             4|        101|   2023-06-04|   Product C|       1|
|             5|        104|   2023-06-05|   Product B|       4|
|             6|        102|   2023-06-06|   Product A|       3|
|             7|        103|   2023-06-07|   Product C|       3|
|             8|        101|   2023-06-08|   Product B|       5|
|             9|        104|   2023-06-09|   Product A|       3|
|            10|        103|   2023-06-10|   Product B|       3|
+--------------+-----------+-------------+------------+--------+
