In [0]:
from pyspark.sql import SparkSession
from delta.tables import DeltaTable

# Create a Spark session with Delta Lake support
spark = SparkSession.builder.appName("DeltaLakeDemo") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

# Create a Delta Table
data = [(1, "Alice", 50000), (2, "Bob", 60000)]
columns = ["id", "name", "salary"]

df = spark.createDataFrame(data, columns)
df.write.format("delta").mode("overwrite").save("/mnt/delta/employee")

# Read and display the table
df_delta = spark.read.format("delta").load("/mnt/delta/employee")
df_delta.show()


+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1|Alice| 50000|
|  2|  Bob| 60000|
+---+-----+------+



##Demonstrate ACID Transactions

In [0]:
df = spark.read.format("delta").load("/mnt/delta/employee")
df = df.withColumn("salary", df.salary + 5000)
df.write.format("delta").mode("overwrite").save("/mnt/delta/employee")
df.show()


+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1|Alice| 65000|
|  2|  Bob| 75000|
+---+-----+------+



## Example 2: Atomic Insert

In [0]:
from delta.tables import *

deltaTable = DeltaTable.forPath(spark, "/mnt/delta/employee")

deltaTable.alias("old").merge(
    spark.createDataFrame([(3, "Charlie", 70000)], ["id", "name", "salary"]).alias("new"),
    "old.id = new.id"
).whenNotMatchedInsert(values={"id": "new.id", "name": "new.name", "salary": "new.salary"}) \
.execute()
df.show()

+---+-------+------+
| id|   name|salary|
+---+-------+------+
|  3|Charlie| 75000|
|  1|  Alice| 65000|
|  2|    Bob| 75000|
+---+-------+------+



##Time Travel in Delta Lake

In [0]:
df = spark.read.format("delta").option("versionAsOf", 0).load("/mnt/delta/employee")
df.show()

+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1|Alice| 50000|
|  2|  Bob| 60000|
+---+-----+------+



In [0]:
df = spark.read.format("delta").option("versionAsOf", 3).load("/mnt/delta/employee")
df.show()

+---+-------+------+
| id|   name|salary|
+---+-------+------+
|  3|Charlie| 70000|
|  1|  Alice| 60000|
|  2|    Bob| 70000|
+---+-------+------+



In [0]:
spark.sql("DESCRIBE HISTORY delta.`/tmp/delta_table`").show(truncate=False)


+-------+-------------------+----------------+--------------------------+---------+--------------------------------------+----+------------------+--------------------+-----------+-----------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp          |userId          |userName                  |operation|operationParameters                   |job |notebook          |clusterId           |readVersion|isolationLevel   |isBlindAppend|operationMetrics                                                                                                                                                                 |userMetadata|engineInfo                         |
+-------+-------------------+----------------+--------------------------+---------+--------------------------------------+

##Z-Ordering and Data Compaction

In [0]:
from delta.tables import DeltaTable

# Define Delta table path
delta_table_path = "/tmp/delta_table"

# Read the existing Delta table
df = spark.read.format("delta").load(delta_table_path)

In [0]:
# Repartition data to optimize file layout
df_repartitioned = df.repartition(4)  # Adjust partition count based on data size

# Sort the data manually by the Z-Order column (e.g., 'salary')
df_sorted = df_repartitioned.sort("salary")

# Overwrite the existing Delta table with optimized layout
df_sorted.write.format("delta").mode("overwrite").save(delta_table_path)


In [0]:
df_history = spark.sql("DESCRIBE HISTORY delta.`/tmp/delta_table`")
df_history.show(truncate=False)


+-------+-------------------+----------------+--------------------------+---------+--------------------------------------+----+------------------+--------------------+-----------+-----------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp          |userId          |userName                  |operation|operationParameters                   |job |notebook          |clusterId           |readVersion|isolationLevel   |isBlindAppend|operationMetrics                                                                                                                                                                 |userMetadata|engineInfo                         |
+-------+-------------------+----------------+--------------------------+---------+--------------------------------------+

##try with csv

###Download the CSV File

In [0]:
import requests
import os

# Define paths
url = "https://gist.githubusercontent.com/kevin336/acbb2271e66c10a5b73aacf82ca82784/raw/e38afe62e088394d61ed30884dd50a6826eee0a8/employees.csv"
local_path = "/tmp/employees.csv"  # Temporary local file
dbfs_path = "dbfs:/tmp/employees.csv"  # DBFS path

# Download file to local temp directory
response = requests.get(url)
with open(local_path, "wb") as f:
    f.write(response.content)

# Move file to DBFS
dbutils.fs.mv(f"file:{local_path}", dbfs_path)

print("File moved to DBFS:", dbfs_path)


File moved to DBFS: dbfs:/tmp/employees.csv


##Read the File from DBFS

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("EmployeeData").getOrCreate()

# Read CSV File from DBFS
df = spark.read.csv("dbfs:/tmp/employees.csv", header=True, inferSchema=True)
df.show(10)


+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|            - |       201|           20|


##Convert CSV to a Delta Table

In [0]:
from delta.tables import DeltaTable

# Define Delta Table Path
delta_path = "dbfs:/tmp/delta_employees"

# Save as Delta Table
df.write.format("delta").mode("overwrite").save(delta_path)

print("✅ Delta Table Created at:", delta_path)


✅ Delta Table Created at: dbfs:/tmp/delta_employees


##Enable ACID Transactions (Update & Delete)

In [0]:
# Load Delta Table
delta_table = DeltaTable.forPath(spark, delta_path)

# Update Transaction: Increase Salary by 10% for employees in 'IT_PROG' job
delta_table.update(
    condition="JOB_ID = 'IT_PROG'",
    set={"SALARY": "SALARY * 1.1"}
)

#Delete Transaction: Remove employees with no department
delta_table.delete(condition="DEPARTMENT_ID IS NULL")

print("✅ ACID Transactions Applied: Update & Delete")



✅ ACID Transactions Applied: Update & Delete


In [0]:
# Read the updated Delta table
df_updated = spark.read.format("delta").load(delta_path)
df_updated.show(20)


+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|            - |       201|           20|


##Use Time Travel to Query Old Versions

In [0]:
# Show Delta Table History (Versions)
spark.read.format("delta").option("history", True).load(delta_path).show(truncate=False)

# Query Previous Version (Before Update)
df_old = spark.read.format("delta").option("versionAsOf", 0).load(delta_path)
df_old.show(10)

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|EMAIL   |PHONE_NUMBER|HIRE_DATE|JOB_ID    |SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|198        |Donald    |OConnell |DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK  |2600  | -            |124       |50           |
|199        |Douglas   |Grant    |DGRANT  |650.507.9844|13-JAN-08|SH_CLERK  |2600  | -            |124       |50           |
|200        |Jennifer  |Whalen   |JWHALEN |515.123.4444|17-SEP-03|AD_ASST   |4400  | -            |101       |10           |
|201        |Michael   |Hartstein|MHARTSTE|515.123.5555|17-FEB-04|MK_MAN    |13000 | -            |100       |20           |
|202        |Pat       |Fay      |PFAY    |603.123.6666|17-AUG-05|MK_REP    |6000  | -            |201       |20           |


##Apply Z-Ordering for Faster Queries

In [0]:
#  Perform Z-Ordering on JOB_ID for faster queries
spark.sql(f"OPTIMIZE delta.`{delta_path}` ZORDER BY JOB_ID")

print("✅ Z-Ordering Applied for Optimization")

✅ Z-Ordering Applied for Optimization


In [0]:
import time

# Run query before optimization
start_time = time.time()
spark.read.format("delta").load(delta_path).filter("JOB_ID = 'IT_PROG'").show()
print("⏳ Query Time (Before Optimization):", time.time() - start_time, "seconds")

# Run query after optimization
start_time = time.time()
spark.sql(f"SELECT * FROM delta.`{delta_path}` WHERE JOB_ID = 'IT_PROG'").show()
print("🚀 Query Time (After Optimization):", time.time() - start_time, "seconds")


+-----------+----------+---------+--------+------------+---------+-------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE| JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+-------+------+--------------+----------+-------------+
|        103| Alexander|   Hunold| AHUNOLD|590.423.4567|03-JAN-06|IT_PROG|  9900|            - |       102|           60|
|        104|     Bruce|    Ernst|  BERNST|590.423.4568|21-MAY-07|IT_PROG|  6600|            - |       103|           60|
|        105|     David|   Austin| DAUSTIN|590.423.4569|25-JUN-05|IT_PROG|  5280|            - |       103|           60|
|        106|     Valli|Pataballa|VPATABAL|590.423.4560|05-FEB-06|IT_PROG|  5280|            - |       103|           60|
|        107|     Diana|  Lorentz|DLORENTZ|590.423.5567|07-FEB-07|IT_PROG|  4620|            - |       103|           60|
+-----------+----------+