<a href="https://colab.research.google.com/github/margaridagomes/dataeng-basic-course/blob/main/spark_streaming/examples/1-read_write_stream.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Read & Write Stream
- readStream()
- writeStream()
- Streaming Dataframe

# Setting up PySpark

In [None]:
%pip install pyspark



In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('Test streaming').getOrCreate()

# readStream with format "rate"
- readStream
- format("rate")

In [2]:
import pyspark.sql.functions as F

# read stream
stream = spark.readStream.format("rate").load()

In [3]:
type(stream)

In [42]:
# checking if it's streaming dataframe
stream.isStreaming

True

In [6]:
# should be false
data = [("c1", "v1"), ("c2", "v2")]
columns = ["col1", "col2"]
df = spark.createDataFrame(data, columns)
df.isStreaming

False

In [43]:
# apply normal dataframe operations
stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)



In [48]:
q1= (stream.writeStream
          .format('memory')
          .queryName('rate_q1')
          .outputMode('append')
          .start())

IllegalArgumentException: Cannot start query with name rate_q1 as a query with that name is already active in this SparkSession

In [49]:
# Queries with streaming sources must be executed with writeStream.start();
# stream.show()
# stream.count()
print(spark.table("rate_q1").count())
spark.table("rate_q1").show(20, False)


1300
+-----------------------+-----+
|timestamp              |value|
+-----------------------+-----+
|2025-07-11 21:33:47.305|0    |
|2025-07-11 21:33:47.355|1    |
|2025-07-11 21:33:47.405|2    |
|2025-07-11 21:33:47.455|3    |
|2025-07-11 21:33:47.505|4    |
|2025-07-11 21:33:47.555|5    |
|2025-07-11 21:33:47.605|6    |
|2025-07-11 21:33:47.655|7    |
|2025-07-11 21:33:47.705|8    |
|2025-07-11 21:33:47.755|9    |
|2025-07-11 21:33:47.805|10   |
|2025-07-11 21:33:47.855|11   |
|2025-07-11 21:33:47.905|12   |
|2025-07-11 21:33:47.955|13   |
|2025-07-11 21:33:48.005|14   |
|2025-07-11 21:33:48.055|15   |
|2025-07-11 21:33:48.105|16   |
|2025-07-11 21:33:48.155|17   |
|2025-07-11 21:33:48.205|18   |
|2025-07-11 21:33:48.255|19   |
+-----------------------+-----+
only showing top 20 rows



# Transfor streaming dataframe

In [13]:
transformed = stream.withColumn("value2", F.col("value") * 2)

In [16]:
transformed.isStreaming

True

# write streaming dataframe - format memory
- writeStream
- format("memory")
- queryName
- outputMode
- start

In [17]:
query = (transformed.writeStream
  .format('memory')
  .queryName('rate_report')
  .outputMode('append')
  .start()
)

# Checking result table

In [18]:
# StreamingQuery
type(query)

In [19]:
print(spark.table("rate_report").count())
spark.table("rate_report").show(20, False)

# one line per second

8
+-----------------------+-----+------+
|timestamp              |value|value2|
+-----------------------+-----+------+
|2025-07-11 21:29:07.709|0    |0     |
|2025-07-11 21:29:08.709|1    |2     |
|2025-07-11 21:29:09.709|2    |4     |
|2025-07-11 21:29:10.709|3    |6     |
|2025-07-11 21:29:11.709|4    |8     |
|2025-07-11 21:29:12.709|5    |10    |
|2025-07-11 21:29:13.709|6    |12    |
|2025-07-11 21:29:14.709|7    |14    |
|2025-07-11 21:29:15.709|8    |16    |
|2025-07-11 21:29:16.709|9    |18    |
|2025-07-11 21:29:17.709|10   |20    |
|2025-07-11 21:29:18.709|11   |22    |
|2025-07-11 21:29:19.709|12   |24    |
+-----------------------+-----+------+



In [24]:
query.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [25]:
query.isActive

True

In [26]:
query.recentProgress

[{'id': 'dd4b6135-223c-4820-9792-42ce4d6f9729',
  'runId': 'b1894848-fb8d-467e-8dac-466963b9fa02',
  'name': 'rate_report',
  'timestamp': '2025-07-11T21:29:07.847Z',
  'batchId': 0,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'addBatch': 1589,
   'commitOffsets': 70,
   'getBatch': 4,
   'latestOffset': 0,
   'queryPlanning': 86,
   'triggerExecution': 1878,
   'walCommit': 98},
  'stateOperators': [],
  'sources': [{'description': 'RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default',
    'startOffset': None,
    'endOffset': 0,
    'latestOffset': 0,
    'numInputRows': 0,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 0.0}],
  'sink': {'description': 'MemorySink', 'numOutputRows': 0}},
 {'id': 'dd4b6135-223c-4820-9792-42ce4d6f9729',
  'runId': 'b1894848-fb8d-467e-8dac-466963b9fa02',
  'name': 'rate_report',
  'timestamp': '2025-07-11T21:29:09.785Z',
  'batchId': 1,
  'numInputRows': 2,
  'inp

In [27]:
query.lastProgress['batchId']

47

# Stop streaming

In [28]:
query.stop()

In [None]:
# awaitTermination


In [29]:
query.isActive

False

In [30]:
transformed.isStreaming

True

# Increase rows per second (rate)


In [31]:

# read stream
stream = spark.readStream.format("rate").option("rowsPerSecond", 20).load()

transformed = stream.withColumn("value2", F.col("value") * 2)

query = (transformed.writeStream
  .format('memory')
  .queryName('rate_report_2')
  .outputMode('append')
  .start()
)


In [32]:
print(spark.table("rate_report_2").count())
spark.table("rate_report_2").show(100, False)

40
+-----------------------+-----+------+
|timestamp              |value|value2|
+-----------------------+-----+------+
|2025-07-11 21:31:36.466|0    |0     |
|2025-07-11 21:31:36.516|1    |2     |
|2025-07-11 21:31:36.566|2    |4     |
|2025-07-11 21:31:36.616|3    |6     |
|2025-07-11 21:31:36.666|4    |8     |
|2025-07-11 21:31:36.716|5    |10    |
|2025-07-11 21:31:36.766|6    |12    |
|2025-07-11 21:31:36.816|7    |14    |
|2025-07-11 21:31:36.866|8    |16    |
|2025-07-11 21:31:36.916|9    |18    |
|2025-07-11 21:31:36.966|10   |20    |
|2025-07-11 21:31:37.016|11   |22    |
|2025-07-11 21:31:37.066|12   |24    |
|2025-07-11 21:31:37.116|13   |26    |
|2025-07-11 21:31:37.166|14   |28    |
|2025-07-11 21:31:37.216|15   |30    |
|2025-07-11 21:31:37.266|16   |32    |
|2025-07-11 21:31:37.316|17   |34    |
|2025-07-11 21:31:37.366|18   |36    |
|2025-07-11 21:31:37.416|19   |38    |
|2025-07-11 21:31:37.466|20   |40    |
|2025-07-11 21:31:37.516|21   |42    |
|2025-07-11 21:31:37.5

In [38]:
query.lastProgress['sources'][0]['numInputRows']

20

In [39]:
for batch in query.recentProgress:
  print(f"timestamp - {batch['timestamp']}")
  print(f"batchId - {batch['batchId']}")
  print(f"numInputRows - {batch['numInputRows']}")
  print("--")

timestamp - 2025-07-11T21:31:36.512Z
batchId - 0
numInputRows - 0
--
timestamp - 2025-07-11T21:31:37.468Z
batchId - 1
numInputRows - 20
--
timestamp - 2025-07-11T21:31:38.472Z
batchId - 2
numInputRows - 20
--
timestamp - 2025-07-11T21:31:39.470Z
batchId - 3
numInputRows - 20
--
timestamp - 2025-07-11T21:31:40.473Z
batchId - 4
numInputRows - 20
--
timestamp - 2025-07-11T21:31:41.470Z
batchId - 5
numInputRows - 20
--
timestamp - 2025-07-11T21:31:42.466Z
batchId - 6
numInputRows - 20
--
timestamp - 2025-07-11T21:31:43.471Z
batchId - 7
numInputRows - 20
--
timestamp - 2025-07-11T21:31:44.475Z
batchId - 8
numInputRows - 20
--
timestamp - 2025-07-11T21:31:45.473Z
batchId - 9
numInputRows - 20
--
timestamp - 2025-07-11T21:31:46.476Z
batchId - 10
numInputRows - 20
--
timestamp - 2025-07-11T21:31:47.475Z
batchId - 11
numInputRows - 20
--


In [40]:
query.stop()

In [41]:
query.isActive

False