In [0]:
# Install pinned PySpark and Delta Lake releases; %pip installs into the running kernel's environment.
%pip install pyspark==3.5.0 delta-spark==3.1.0


In [0]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
# Session builder with the Delta Lake SQL extension and Delta catalog registered.
builder = (
    SparkSession.builder.appName("DeltaDemo")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)
# Pass the builder through the delta helper, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Sample orders: one tuple per order, fields in the order given by `columns`.
columns = ["order_id", "customer_name", "category", "quantity", "amount"]
data = [
    (1, "John", "Electronics", 2, 300),
    (2, "Sara", "Clothing", 1, 50),
    (3, "Mike", "Electronics", 4, 600),
    (4, "Nina", "Clothing", 3, 150),
]
df = spark.createDataFrame(data, columns)

# Write the sample orders as a Delta table; "overwrite" replaces whatever is
# already stored at this path, so re-running the cell resets the table contents.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("/tmp/orders_delta")
)

In [0]:
# Register two catalog tables on top of the Delta files written to /tmp/orders_delta.
#
# NOTE: a table created with an explicit LOCATION is an *external* (unmanaged)
# table: the catalog only stores a pointer to the path. Despite its name,
# `orders_managed` is therefore external as well, and both tables are views of
# the same underlying data. A truly managed table would omit LOCATION
# (e.g. df.write.format("delta").saveAsTable(...)).
#
# IF NOT EXISTS keeps this cell re-runnable within the same Spark session;
# a plain CREATE TABLE raises once the table name is already registered.
spark.sql("""
CREATE TABLE IF NOT EXISTS orders_managed
USING DELTA
LOCATION '/tmp/orders_delta'
""")

# Second external table registered over the same location.
spark.sql("""
CREATE TABLE IF NOT EXISTS orders_unmanaged
USING DELTA
LOCATION '/tmp/orders_delta'
""")


In [0]:
# Read the current table contents directly from the Delta path
df_orders = spark.read.format("delta").load("/tmp/orders_delta")
df_orders.show()

# Update: increase amount for the 'Clothing' category by 20.
# NOTE: re-running this cell on its own applies the +20 again.
spark.sql("""
UPDATE orders_managed
SET amount = amount + 20
WHERE category = 'Clothing'
""")

# Delete: remove all orders with quantity < 2
spark.sql("""
DELETE FROM orders_managed
WHERE quantity < 2
""")

# Prepare new and updated orders
new_data = [
    (5, "Anna", "Furniture", 2, 400),
    (6, "James", "Clothing", 1, 100),
]
new_columns = ["order_id", "customer_name", "category", "quantity", "amount"]
new_df = spark.createDataFrame(new_data, new_columns)

# Merge (upsert) the new data into the table, keyed on order_id:
# rows whose order_id already exists are updated, the rest are inserted.
# A plain append would store the same order_id again on every re-run.
new_df.createOrReplaceTempView("order_updates")
spark.sql("""
MERGE INTO orders_managed AS target
USING order_updates AS source
ON target.order_id = source.order_id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
""")


In [0]:
# List the commit history recorded for the Delta table stored at this path
spark.sql(
    "DESCRIBE HISTORY delta.`/tmp/orders_delta`"
).show()

# Time travel: load the table as it was at version 0 (the initial write)
df_version_0 = (
    spark.read.format("delta")
    .option("versionAsOf", 0)
    .load("/tmp/orders_delta")
)
df_version_0.show()

# No version option given, so this load returns the latest version
df_latest = (
    spark.read.format("delta")
    .load("/tmp/orders_delta")
)
df_latest.show()


In [0]:
# New batch with orders 5 and 6
new_orders_batch = [
    (5, "Anna", "Furniture", 2, 400),
    (6, "James", "Clothing", 1, 100),
]
batch_columns = ["order_id", "customer_name", "category", "quantity", "amount"]
new_batch_df = spark.createDataFrame(new_orders_batch, batch_columns)

# Upsert the batch keyed on order_id instead of blindly appending it.
# Orders 5 and 6 were already added to this table by an earlier cell, so a
# plain append would store them twice and double-count their amounts in any
# downstream aggregation. MERGE updates matching order_ids and inserts the rest.
new_batch_df.createOrReplaceTempView("new_orders_batch_view")
spark.sql("""
MERGE INTO delta.`/tmp/orders_delta` AS target
USING new_orders_batch_view AS source
ON target.order_id = source.order_id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
""")

# Show final table after the merge
final_df = spark.read.format("delta").load("/tmp/orders_delta")
final_df.show()


In [0]:
import pandas as pd
import matplotlib.pyplot as plt

# Pull the Delta table into a pandas DataFrame on the driver
df_pandas = (
    spark.read.format("delta")
    .load("/tmp/orders_delta")
    .toPandas()
)

# Total amount per category, with 'category' kept as a regular column
category_amount = df_pandas.groupby("category", as_index=False)["amount"].sum()

# Bar chart of the per-category totals, drawn with the explicit Axes interface
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(category_amount["category"], category_amount["amount"], color="skyblue")
ax.set_title("Total Amount by Category")
ax.set_xlabel("Category")
ax.set_ylabel("Total Amount")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()
