<a href="https://colab.research.google.com/github/margaridagomes/dataeng-basic-course/blob/main/spark_streaming/examples/1-read_write_stream.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Read & Write Stream
- readStream()
- writeStream()
- Streaming Dataframe

# Setting up PySpark

In [None]:
# Install PySpark into the kernel's environment (needed on Colab, where it is not preinstalled).
# NOTE(review): the version is unpinned — consider pinning it so the notebook stays reproducible.
%pip install pyspark



In [1]:
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) a local SparkSession for the
# streaming examples in this notebook.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Test streaming')
    .getOrCreate()
)

# readStream with format "rate"
- readStream
- format("rate")

In [2]:
import pyspark.sql.functions as F

# Create a streaming DataFrame from the built-in "rate" test source,
# which generates rows with a `timestamp` and a `value` column.
stream = (
    spark.readStream
    .format("rate")
    .load()
)

In [3]:
# Inspect the Python type of the object returned by readStream...load().
type(stream)

In [4]:
# Check whether this is a streaming DataFrame:
# isStreaming is True for a DataFrame created through readStream.
stream.isStreaming

True

In [5]:
# Counter-example: a DataFrame built from local, static data is a batch
# DataFrame, so isStreaming is expected to be False.
columns = ["col1", "col2"]
data = [
    ("c1", "v1"),
    ("c2", "v2"),
]
df = spark.createDataFrame(data, schema=columns)
df.isStreaming

False

In [6]:
# Regular DataFrame operations such as printSchema() also work on a streaming DataFrame.
# The rate source exposes two columns: `timestamp` and `value`.
stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)



In [7]:
# Actions that need a complete result (count(), show(), ...) cannot be run
# directly on a streaming DataFrame: Spark rejects them with
# "Queries with streaming sources must be executed with writeStream.start()".
# The error is the point of this cell, so it is caught and printed instead of
# being left uncaught — otherwise "Restart & Run All" would stop here.
from pyspark.sql.utils import AnalysisException

try:
    stream.count()
except AnalysisException as err:
    print(f"AnalysisException: {err}")

AnalysisException: Queries with streaming sources must be executed with writeStream.start();
rate

# Transform streaming dataframe

In [8]:
# Column transformations are declared on a streaming DataFrame the same way
# as on a batch one: add `value2` as twice the generated `value`.
transformed = stream.withColumn(
    "value2",
    F.col("value") * 2,
)

In [9]:
# The result of transforming a streaming DataFrame is still a streaming DataFrame.
transformed.isStreaming

True

# write streaming dataframe - format memory
- writeStream
- format("memory")
- queryName
- outputMode
- start

In [10]:
# Start the streaming query; nothing is processed until start() is called.
query = (
    transformed.writeStream
    .format('memory')          # memory sink: results are kept in an in-memory table
    .queryName('rate_report')  # table name, read back later with spark.table('rate_report')
    .outputMode('append')      # append newly produced rows to the result table
    .start()                   # returns the handle used below to monitor and stop the query
)

# Checking result table

In [14]:
# start() returned a StreamingQuery handle; inspect its type.
type(query)

In [13]:
# The result table receives one row per second from the rate source,
# so re-running this cell shows a growing row count.
print(spark.table("rate_report").count())
spark.table("rate_report").show(n=20, truncate=False)

54
+-----------------------+-----+------+
|timestamp              |value|value2|
+-----------------------+-----+------+
|2025-06-28 14:25:07.382|0    |0     |
|2025-06-28 14:25:08.382|1    |2     |
|2025-06-28 14:25:09.382|2    |4     |
|2025-06-28 14:25:10.382|3    |6     |
|2025-06-28 14:25:11.382|4    |8     |
|2025-06-28 14:25:12.382|5    |10    |
|2025-06-28 14:25:13.382|6    |12    |
|2025-06-28 14:25:14.382|7    |14    |
|2025-06-28 14:25:15.382|8    |16    |
|2025-06-28 14:25:16.382|9    |18    |
|2025-06-28 14:25:17.382|10   |20    |
|2025-06-28 14:25:18.382|11   |22    |
|2025-06-28 14:25:19.382|12   |24    |
|2025-06-28 14:25:20.382|13   |26    |
|2025-06-28 14:25:21.382|14   |28    |
|2025-06-28 14:25:22.382|15   |30    |
|2025-06-28 14:25:23.382|16   |32    |
|2025-06-28 14:25:24.382|17   |34    |
|2025-06-28 14:25:25.382|18   |36    |
|2025-06-28 14:25:26.382|19   |38    |
+-----------------------+-----+------+
only showing top 20 rows



In [15]:
# Current status of the query: a status message plus the
# isDataAvailable / isTriggerActive flags.
query.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [16]:
# True while the streaming query is running (it has not been stopped yet).
query.isActive

True

In [17]:
# Most recent progress reports, one dict per micro-batch
# (batchId, input rows, processing rates, durations, source and sink details).
query.recentProgress

[{'id': '637a3837-3637-4830-b12f-70aaa1ee61ed',
  'runId': '27be041d-1d86-4237-8caa-bdb57ccc7795',
  'name': 'rate_report',
  'timestamp': '2025-06-28T14:26:30.383Z',
  'batchId': 83,
  'numInputRows': 1,
  'inputRowsPerSecond': 90.90909090909092,
  'processedRowsPerSecond': 8.264462809917356,
  'durationMs': {'addBatch': 63,
   'commitOffsets': 23,
   'getBatch': 0,
   'latestOffset': 0,
   'queryPlanning': 9,
   'triggerExecution': 121,
   'walCommit': 26},
  'stateOperators': [],
  'sources': [{'description': 'RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default',
    'startOffset': 82,
    'endOffset': 83,
    'latestOffset': 83,
    'numInputRows': 1,
    'inputRowsPerSecond': 90.90909090909092,
    'processedRowsPerSecond': 8.264462809917356}],
  'sink': {'description': 'MemorySink', 'numOutputRows': 1}},
 {'id': '637a3837-3637-4830-b12f-70aaa1ee61ed',
  'runId': '27be041d-1d86-4237-8caa-bdb57ccc7795',
  'name': 'rate_report',
  'timestamp': '2025-06-28T14:26:

In [23]:
# batchId of the latest progress report; it keeps increasing while the query runs.
query.lastProgress['batchId']

197

# Stop streaming

In [27]:
# Stop the first streaming query (rate_report).
query.stop()

In [None]:
# awaitTermination — TODO: placeholder cell; add an example of query.awaitTermination() here or remove the cell


In [29]:
# When the notebook is run top to bottom, this follows query.stop() and is
# expected to evaluate to False (as in the last cell of this notebook).
# NOTE(review): the saved output shows True, apparently because this cell was
# executed out of order, after the next section had already rebound `query`
# to the newly started rate_report_2 stream — re-run in order to refresh it.
query.isActive

True

# Increase rows per second (rate)


In [28]:

# Read from the rate source again, this time asking for 20 rows per second.
stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 20)
    .load()
)

# Add `value2` as twice the generated `value`.
transformed = stream.withColumn(
    "value2",
    F.col("value") * 2,
)

# Write to a separate in-memory table (rate_report_2) and start the query.
query = (
    transformed.writeStream
    .format('memory')
    .queryName('rate_report_2')
    .outputMode('append')
    .start()
)


In [31]:
# Row count and the first 100 rows (untruncated) of the second result table.
print(spark.table("rate_report_2").count())
spark.table("rate_report_2").show(n=100, truncate=False)

840
+-----------------------+-----+------+
|timestamp              |value|value2|
+-----------------------+-----+------+
|2025-06-28 14:33:41.575|0    |0     |
|2025-06-28 14:33:41.625|1    |2     |
|2025-06-28 14:33:41.675|2    |4     |
|2025-06-28 14:33:41.725|3    |6     |
|2025-06-28 14:33:41.775|4    |8     |
|2025-06-28 14:33:41.825|5    |10    |
|2025-06-28 14:33:41.875|6    |12    |
|2025-06-28 14:33:41.925|7    |14    |
|2025-06-28 14:33:41.975|8    |16    |
|2025-06-28 14:33:42.025|9    |18    |
|2025-06-28 14:33:42.075|10   |20    |
|2025-06-28 14:33:42.125|11   |22    |
|2025-06-28 14:33:42.175|12   |24    |
|2025-06-28 14:33:42.225|13   |26    |
|2025-06-28 14:33:42.275|14   |28    |
|2025-06-28 14:33:42.325|15   |30    |
|2025-06-28 14:33:42.375|16   |32    |
|2025-06-28 14:33:42.425|17   |34    |
|2025-06-28 14:33:42.475|18   |36    |
|2025-06-28 14:33:42.525|19   |38    |
|2025-06-28 14:33:42.575|20   |40    |
|2025-06-28 14:33:42.625|21   |42    |
|2025-06-28 14:33:42.

In [38]:
# Rows read from the first source in the latest micro-batch;
# the value 20 matches the rowsPerSecond option set on the rate source.
query.lastProgress['sources'][0]['numInputRows']

20

In [40]:
# Summarise each recent progress report: when it was emitted, which
# micro-batch it describes and how many rows that batch read.
for batch in query.recentProgress:
    print(
        f"timestamp - {batch['timestamp']}",
        f"batchId - {batch['batchId']}",
        f"numInputRows - {batch['numInputRows']}",
        "--",
        sep="\n",
    )

timestamp - 2025-06-28T14:33:41.607Z
batchId - 0
numInputRows - 0
--
timestamp - 2025-06-28T14:33:42.579Z
batchId - 1
numInputRows - 20
--
timestamp - 2025-06-28T14:33:43.583Z
batchId - 2
numInputRows - 20
--
timestamp - 2025-06-28T14:33:44.580Z
batchId - 3
numInputRows - 20
--
timestamp - 2025-06-28T14:33:45.578Z
batchId - 4
numInputRows - 20
--
timestamp - 2025-06-28T14:33:46.583Z
batchId - 5
numInputRows - 20
--
timestamp - 2025-06-28T14:33:47.581Z
batchId - 6
numInputRows - 20
--
timestamp - 2025-06-28T14:33:48.582Z
batchId - 7
numInputRows - 20
--
timestamp - 2025-06-28T14:33:49.578Z
batchId - 8
numInputRows - 20
--
timestamp - 2025-06-28T14:33:50.578Z
batchId - 9
numInputRows - 20
--
timestamp - 2025-06-28T14:33:51.575Z
batchId - 10
numInputRows - 20
--
timestamp - 2025-06-28T14:33:52.575Z
batchId - 11
numInputRows - 20
--
timestamp - 2025-06-28T14:33:53.577Z
batchId - 12
numInputRows - 20
--
timestamp - 2025-06-28T14:33:54.582Z
batchId - 13
numInputRows - 20
--
timestamp - 2025-

In [41]:
# Stop the second streaming query (rate_report_2).
query.stop()

In [42]:
# After stop(), the query is no longer active, so this evaluates to False.
query.isActive

False