 

## 1. How to create and update Delta tables ?**


## **What is a Delta Table?**
A Delta Table is a table backed by the Delta Lake format, which brings transactional support to data lakes using .delta files.

In [0]:
data = [("Alice", 25), ("Bob", 30)]
df = spark.createDataFrame(data, ["Name", "age"])

# Write to Delta table
df.write.format("delta").mode("overwrite").saveAsTable("peoples")  # 

In [0]:
df = spark.read.format("delta").load("/delta/peoples")



In [0]:
df = spark.table("peoples")
df.show()

+-----+---+
| Name|age|
+-----+---+
|Alice| 25|
|  Bob| 30|
+-----+---+



In [0]:
%sql
UPDATE people
SET Age = Age + 1
WHERE Name = 'Alice';

num_affected_rows
0


In [0]:
from delta.tables import DeltaTable

# Load the Delta table
delta_table = DeltaTable.forPath(spark, "/delta/peoples")

# Update conditionally
delta_table.update(
    condition="Name = 'Alice'",
    set={"Age": "Age + 1"}
)

In [0]:
%sql
INSERT INTO peoples VALUES ('David', 40);

num_affected_rows,num_inserted_rows
1,1


In [0]:
%sql
DELETE FROM peoples WHERE Name = 'Bob';

num_affected_rows
1


In [0]:
%sql
DESCRIBE HISTORY people;

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2025-06-10T15:39:17.000+0000,3303542899615855,21embit039@mlvti.ac.in,DELETE,"Map(predicate -> [""(Name#2578 = Bob)""])",,List(3643057974841132),0610-153151-x7emzcp6,2.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 887, numDeletedRows -> 0, scanTimeMs -> 887, numAddedFiles -> 0, numAddedBytes -> 0, rewriteTimeMs -> 0)",,Databricks-Runtime/12.2.x-scala2.12
2,2025-06-10T15:38:34.000+0000,3303542899615855,21embit039@mlvti.ac.in,WRITE,"Map(mode -> Append, partitionBy -> [])",,List(3643057974841132),0610-153151-x7emzcp6,1.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 819)",,Databricks-Runtime/12.2.x-scala2.12
1,2025-06-10T15:36:16.000+0000,3303542899615855,21embit039@mlvti.ac.in,UPDATE,"Map(predicate -> [""(Name#1147 = Alice)""])",,List(3643057974841132),0610-153151-x7emzcp6,0.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 876, scanTimeMs -> 837, numAddedFiles -> 0, numUpdatedRows -> 0, numAddedBytes -> 0, rewriteTimeMs -> 0)",,Databricks-Runtime/12.2.x-scala2.12
0,2025-06-10T15:35:48.000+0000,3303542899615855,21embit039@mlvti.ac.in,CREATE TABLE,"Map(isManaged -> true, description -> null, partitionBy -> [], properties -> {})",,List(3643057974841132),0610-153151-x7emzcp6,,WriteSerializable,True,Map(),,Databricks-Runtime/12.2.x-scala2.12


## **2. Write and update records in a Delta table.**

In [0]:
# Create a sample DataFrame
data = [("Alice", 25), ("Bob", 30), ("Charlie", 28)]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, columns)

# Write as Delta table (managed table)
df.write.format("delta").saveAsTable("student")


In [0]:
spark.table("student").show()

+-------+---+
|   Name|Age|
+-------+---+
|Charlie| 28|
|  Alice| 25|
|    Bob| 30|
+-------+---+



In [0]:
%sql
-- Update Bob's age
UPDATE student
SET Age = 31
WHERE Name = 'Bob';


num_affected_rows
1


In [0]:
spark.sql("SELECT * FROM student").show()

+-------+---+
|   Name|Age|
+-------+---+
|Charlie| 28|
|  Alice| 25|
|    Bob| 31|
+-------+---+



## **3. Write a simple ETL pipeline using select, filter, and groupBy.**

In [0]:
df = spark.read.option("header", True).option("inferSchema", True).csv("/FileStore/tables/employees.csv")

# Show the raw data
df.show()

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|            - |       201|           20|


In [0]:
selected_df = df.select("FIRST_NAME", "DEPARTMENT_ID", "SALARY")

In [0]:
filtered_df = selected_df.filter("Salary > 5000")


In [0]:
avg_salary_df = filtered_df.groupBy("DEPARTMENT_ID").avg("SALARY") \
    .withColumnRenamed("avg(SALARY)", "AVG_SALARY") \
    .orderBy("DEPARTMENT_ID")

In [0]:
avg_salary_df.show()
avg_salary_df.show()



+-------------+------------------+
|DEPARTMENT_ID|        AVG_SALARY|
+-------------+------------------+
|           20|            9500.0|
|           30|           11000.0|
|           40|            6500.0|
|           50|            7280.0|
|           60|            7500.0|
|           70|           10000.0|
|           90|19333.333333333332|
|          100| 8601.333333333334|
|          110|           10154.0|
+-------------+------------------+



In [0]:
avg_salary_df.write.format("delta").mode("overwrite").saveAsTable("department_salary")


In [0]:
spark.sql("SELECT * FROM department_salary").show()

+-------------+------------------+
|DEPARTMENT_ID|        AVG_SALARY|
+-------------+------------------+
|           20|            9500.0|
|           30|           11000.0|
|           40|            6500.0|
|           50|            7280.0|
|           60|            7500.0|
|           70|           10000.0|
|           90|19333.333333333332|
|          100| 8601.333333333334|
|          110|           10154.0|
+-------------+------------------+

