# Implementing Structured Streaming

### Create Sample Customer Data

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
import datetime

# Column layout of the customer records: (column name, Spark type) in order.
customer_columns = [
    ("id", IntegerType()),
    ("name", StringType()),
    ("email", StringType()),
    ("city", StringType()),
    ("state", StringType()),
    ("dob", DateType()),
]

# Every column is declared nullable.
schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in customer_columns]
)

# Seed rows: one tuple per customer, values in schema column order.
data = [
    (1, "Alice Smith", "alice@example.com", "New York", "NY", datetime.date(1990, 5, 12)),
    (2, "Bob Johnson", "bob@example.com", "Los Angeles", "CA", datetime.date(1985, 8, 20)),
    (3, "Carol Lee", "carol@example.com", "Chicago", "IL", datetime.date(1992, 2, 17)),
]

# Materialise the rows as a batch DataFrame using the explicit schema.
df = spark.createDataFrame(data, schema)

# Overwrite the Delta table that the streaming read uses as its source.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("customer_source_table")
)


### Create the streaming dataframe

In [0]:
# Open a streaming read over the customer_source_table table.
streamDF = spark.readStream.table("customer_source_table")

### Data Transformation

Example transformations:
- Calculate Age
- Uppercase city name

In [0]:
from pyspark.sql.functions import current_date, datediff, floor, months_between, upper

# Add computed columns.
# age: whole years between dob and today. Dividing a day count by 365 ignores
# leap days and reports the next age a few days before the birthday, so the
# age is derived from whole calendar months instead.
# city_upper: the city name converted to upper case.
transformedDF = streamDF.withColumn(
    "age",
    floor(months_between(current_date(), streamDF.dob) / 12).cast("int")
).withColumn(
    "city_upper",
    upper(streamDF.city)
)

# Show the schema (source columns plus the two computed columns).
transformedDF.printSchema()

### Persist the Streaming Data

In [0]:
# Start a streaming write of the transformed rows into a Delta table.
# - queryName gives the query a name, so listings of spark.streams.active
#   show something other than None.
# - toTable() is the documented DataStreamWriter method for starting a query
#   that writes to a named table.
# - The checkpoint location stores the query's progress between runs.
query = (
    transformedDF.writeStream
    .format("delta")
    .queryName("customer_query")
    .trigger(processingTime="5 second")
    .outputMode("append")
    .option("checkpointLocation", "/tmp/checkpoints/customer_query")
    .toTable("customer_target_table")
)

In [0]:
# The streaming query runs asynchronously, so block until it has processed
# everything currently available in the source; otherwise the target table
# may not exist yet or may still be empty when it is read.
query.processAllAvailable()
spark.sql("SELECT * FROM customer_target_table").show()

### Append More Customer Data

In [0]:
# Two more customers, laid out in the same column order as `schema`.
new_data = [
    (4, "David Brown", "david@example.com", "Houston", "TX", datetime.date(1988, 11, 5)),
    (5, "Eva Green", "eva@example.com", "Phoenix", "AZ", datetime.date(1995, 3, 23)),
]

new_df = spark.createDataFrame(new_data, schema)

# Append the new rows to the source table that the stream reads from.
(
    new_df.write
    .format("delta")
    .mode("append")
    .saveAsTable("customer_source_table")
)

In [0]:
# Wait for the running streaming query to process the rows that are now
# available in the source, so the result below is not read before the
# stream has caught up.
query.processAllAvailable()
spark.sql("SELECT * FROM customer_target_table").show()

In [0]:
# Print name, id and active flag for each streaming query in this session.
for active_query in spark.streams.active:
    print(
        active_query.name,
        active_query.id,
        active_query.isActive,
    )

In [0]:
# Stop the streaming query held in `query`.
query.stop()

### Clean Up

In [0]:
%sql
-- IF EXISTS keeps clean-up re-runnable when a table is already gone.
DROP TABLE IF EXISTS customer_source_table;
DROP TABLE IF EXISTS customer_target_table;

In [0]:
# Recursively delete the streaming checkpoint directory.
dbutils.fs.rm("/tmp/checkpoints/customer_query", True)