<a href="https://colab.research.google.com/github/margaridagomes/dataeng-basic-course/blob/main/spark_streaming/examples/example_2_rate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Use case 2
- Reading data from the "rate" source
- Aggregating data by time window
- Checking results from the query in memory

# Setting up PySpark

In [None]:
# Install PySpark into the environment used by this notebook's kernel
%pip install pyspark



In [8]:
from pyspark.sql import SparkSession

# Create (or reuse) a local Spark session named "Test streaming" for the examples below
spark = (
    SparkSession.builder
    .master('local')
    .appName('Test streaming')
    .getOrCreate()
)

# Write output in memory

In [9]:
import pyspark.sql.functions as F

# Source: the "rate" generator, configured to emit 10 rows per second
stream1 = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 10)
    .load()
)

# Transform: add the minute of each timestamp, then count rows per 5-second window
transformed = stream1.withColumn("minute", F.minute("timestamp"))
agg = transformed.groupBy(F.window(transformed["timestamp"], "5 seconds")).count()

# Sink: publish the complete aggregate as the in-memory table "my_query"
query = (
    agg.writeStream
    .format('memory')
    .queryName('my_query')
    .outputMode('complete')
    .start()
    # .awaitTermination(10)  # sets how long to wait for processing (10 s here); the call blocks while waiting
)

In [19]:
# Check whether the in-memory streaming query is still running
query.isActive

True

In [22]:
# Show the ten most recent windows from the in-memory "my_query" table, untruncated
spark.sql("select * from my_query order by window desc").show(10, truncate=False)

+------------------------------------------+-----+
|window                                    |count|
+------------------------------------------+-----+
|{2025-07-05 08:36:50, 2025-07-05 08:36:55}|43   |
|{2025-07-05 08:36:45, 2025-07-05 08:36:50}|50   |
|{2025-07-05 08:36:40, 2025-07-05 08:36:45}|50   |
|{2025-07-05 08:36:35, 2025-07-05 08:36:40}|50   |
|{2025-07-05 08:36:30, 2025-07-05 08:36:35}|50   |
|{2025-07-05 08:36:25, 2025-07-05 08:36:30}|50   |
|{2025-07-05 08:36:20, 2025-07-05 08:36:25}|50   |
|{2025-07-05 08:36:15, 2025-07-05 08:36:20}|50   |
|{2025-07-05 08:36:10, 2025-07-05 08:36:15}|50   |
|{2025-07-05 08:36:05, 2025-07-05 08:36:10}|50   |
+------------------------------------------+-----+
only showing top 10 rows



In [23]:
# Stop the in-memory streaming query before starting the next example
query.stop()

# Write output as parquet

In [24]:
# Clear earlier results: content/output holds both the parquet files and the checkpoint written below
!rm -rf content/output

In [25]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame

def save_parquet(df, batch_id):
  """Append a micro-batch to the parquet output, tagged with its batch id and load time.

  Used as the ``foreachBatch`` callback of the streaming query.

  Args:
    df: DataFrame holding the rows of the current micro-batch.
    batch_id: Identifier of the micro-batch; written to the ``batch_id`` column.
  """
  # Stamp every row with the batch it came from and the moment it was written
  stamped = df.withColumn("batch_id", F.lit(batch_id))
  stamped = stamped.withColumn("load_time", F.current_timestamp())

  # Append so that earlier batches already on disk are kept
  stamped.write.mode("append").parquet("content/output/rate_parquet")

# Source: the "rate" generator, configured to emit 10 rows per second
stream1 = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 10)
    .load()
)

# Transform: declare a 5-second watermark on the timestamp, add the minute,
# then count rows per 5-second window
transformed = (
    stream1
    .withWatermark("timestamp", "5 seconds")
    .withColumn("minute", F.minute("timestamp"))
)
agg = transformed.groupBy(F.window(transformed["timestamp"], "5 seconds")).count()

# Sink: every 20 seconds pass the new rows to save_parquet through foreachBatch.
# foreachBatch is for custom output code (e.g. writing to two different
# databases); it writes the stream using batch-style writes.
query = (
    agg.writeStream
    .option('checkpointLocation', 'content/output/checkpoint')
    .trigger(processingTime='20 seconds')
    .outputMode('append')
    .foreachBatch(save_parquet)
    .start()
)


In [27]:
# Load what the streaming query has appended so far and list it oldest window first
result = (
    spark.read
    .format("parquet")
    .load("content/output/rate_parquet/")
)
result.orderBy(F.asc("window")).show(100, truncate=False)

+------------------------------------------+-----+--------+--------------------------+
|window                                    |count|batch_id|load_time                 |
+------------------------------------------+-----+--------+--------------------------+
|{2025-07-05 08:38:20, 2025-07-05 08:38:25}|34   |2       |2025-07-05 08:39:00.194723|
|{2025-07-05 08:38:25, 2025-07-05 08:38:30}|50   |2       |2025-07-05 08:39:00.194723|
|{2025-07-05 08:38:30, 2025-07-05 08:38:35}|50   |3       |2025-07-05 08:39:20.435223|
|{2025-07-05 08:38:35, 2025-07-05 08:38:40}|50   |3       |2025-07-05 08:39:20.435223|
|{2025-07-05 08:38:40, 2025-07-05 08:38:45}|50   |3       |2025-07-05 08:39:20.435223|
|{2025-07-05 08:38:45, 2025-07-05 08:38:50}|50   |3       |2025-07-05 08:39:20.435223|
|{2025-07-05 08:38:50, 2025-07-05 08:38:55}|50   |4       |2025-07-05 08:39:40.189194|
|{2025-07-05 08:38:55, 2025-07-05 08:39:00}|50   |4       |2025-07-05 08:39:40.189194|
|{2025-07-05 08:39:00, 2025-07-05 08:39:05}

In [29]:
# Stop the parquet-writing streaming query
query.stop()

In [30]:
# Confirm the query is no longer running after stop()
query.isActive

False

# Enrich data with faker

In [31]:
!pip install faker

Collecting faker
  Downloading faker-37.4.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.4.0


In [32]:
# Remove the events dataset left by a previous run so the appends below start from empty
!rm -rf content/output/events

In [33]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from faker import Faker

def insert_into_table(df, batch_id):
  """Append a micro-batch to the events dataset, enriched with fake personal data.

  Used as the ``foreachBatch`` callback of the streaming query.

  NOTE: each fake value is generated once per call and attached as a literal,
  so all rows of the same micro-batch receive the same name, address, email,
  date of birth and phone.

  Args:
    df: DataFrame holding the rows of the current micro-batch.
    batch_id: Identifier of the micro-batch (not used by this function).
  """
  generator = Faker()

  # One constant column per fake attribute, added in a single withColumns call
  enriched = df.withColumns({
      'name': F.lit(generator.name()),
      'address': F.lit(generator.address()),
      'email': F.lit(generator.email()),
      'dob': F.lit(generator.date_of_birth()),
      'phone': F.lit(generator.phone_number()),
  })

  # Append so that rows from earlier batches are kept
  writer = enriched.write.mode("append").format("parquet")
  writer.save("content/output/events")

# Source: the "rate" generator, configured to emit one row per second
df_stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 1)
    .load()
)

# Sink: once per second pass the new rows to insert_into_table through foreachBatch
query = (
    df_stream.writeStream
    .outputMode('append')
    .trigger(processingTime='1 seconds')
    .foreachBatch(insert_into_table)
    .start()
)

In [35]:
# Stop the faker enrichment streaming query
query.stop()


In [34]:
# Display up to 100 of the enriched events written so far, without truncating columns
spark.read.parquet("content/output/events").show(100, truncate=False)

+-----------------------+-----+--------------+----------------------------------------------------+----------------------------+----------+--------------------+
|timestamp              |value|name          |address                                             |email                       |dob       |phone               |
+-----------------------+-----+--------------+----------------------------------------------------+----------------------------+----------+--------------------+
|2025-07-05 08:41:02.671|1    |Ronald Wright |05864 Bishop Corners Apt. 119\nBarnesshire, PA 02462|batesjulie@example.org      |1953-11-14|809.500.1077x7499   |
|2025-07-05 08:41:08.671|7    |Holly Rocha   |35750 Krystal Meadow Suite 102\nWernerfort, NE 88466|robertomeza@example.net     |1969-04-05|001-999-437-4560    |
|2025-07-05 08:41:07.671|6    |Jason Duarte  |659 Mason Haven\nWangborough, WY 56803              |davidsonchristy@example.org |1934-04-19|+1-264-202-4012x950 |
|2025-07-05 08:41:05.671|4    |Eri

In [36]:
# Stop the faker enrichment streaming query (this repeats the earlier stop() call)
query.stop()