In [1]:
from pyspark.sql import SparkSession
spark = (SparkSession.builder.appName("cs544")
         .config("spark.sql.shuffle.partitions", 10)
         .config("spark.ui.showConsoleProgress", False)
         .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.2')
         .getOrCreate())

df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "stations-json")
    .option("startingOffsets", "earliest")
    .load()
)

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-18fbcae9-5b98-47ce-a477-614ffcc4c373;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.2 in central
	found org.apache.kafka#kafka-clients;2.8.1 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.ap

In [2]:
df.isStreaming

True

In [17]:
from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType, BooleanType
from pyspark.sql.functions import col, from_json

schema = StructType([
    StructField("station", StringType()),
    StructField("date", DateType()),
    StructField("degrees", DoubleType()),
    StructField("raining", BooleanType())
])

reports = (df.select(from_json(col("value").cast("string"), schema).alias("value")).select("value.*"))

In [18]:
reports.isStreaming

True

In [19]:
reports.printSchema()

root
 |-- station: string (nullable = true)
 |-- date: date (nullable = true)
 |-- degrees: double (nullable = true)
 |-- raining: boolean (nullable = true)



In [24]:
from pyspark.sql.functions import *

counts_df = reports.groupBy("station") \
                  .agg(min("date").alias("start"),
                       max("date").alias("end"),
                       count("degrees").alias("measurements"),
                       avg("degrees").alias("avg"),
                       max("degrees").alias("max"))

In [33]:
s = counts_df.writeStream.format("console").trigger(processingTime="5 seconds").outputMode("complete").start()
s.awaitTermination(30)
s.stop()

23/04/27 00:14:34 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-d198f1c8-263d-4953-8630-6d7374de36f6. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/04/27 00:14:34 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


-------------------------------------------
Batch: 153
-------------------------------------------
+-------+----------+------------------+-------+
|station|      date|           degrees|raining|
+-------+----------+------------------+-------+
|      F|2003-03-18| 37.86342739055065|   null|
|      I|2003-03-18| 61.56180498238043|   null|
|      J|2003-03-18| 29.70340080925409|   null|
|      N|2003-03-18|58.019377660848534|   null|
|      A|2003-03-18|41.796490845216894|   null|
|      B|2003-03-18| 62.56612387245123|   null|
|      C|2003-03-18| 33.12446569069752|   null|
|      K|2003-03-18| 44.38191373841744|   null|
|      L|2003-03-18| 52.07183883643868|   null|
|      E|2003-03-18| 39.77007238422983|   null|
|      O|2003-03-18|30.011716527054602|   null|
|      D|2003-03-18| 50.83271494902297|   null|
|      G|2003-03-18| 52.88499428473371|   null|
|      M|2003-03-18| 33.88032835084895|   null|
|      H|2003-03-18|  42.3471226358732|   null|
+-------+----------+-----------------

23/04/27 00:14:40 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 5213 milliseconds


-------------------------------------------
Batch: 157
-------------------------------------------
+-------+----------+------------------+-------+
|station|      date|           degrees|raining|
+-------+----------+------------------+-------+
|      F|2003-03-22|33.494889956693775|   null|
|      I|2003-03-22| 48.79730206960594|   null|
|      J|2003-03-22|26.186849397373038|   null|
|      N|2003-03-22| 44.27033632310416|   null|
|      A|2003-03-22|44.032732729843566|   null|
|      B|2003-03-22| 68.63313612518905|   null|
|      C|2003-03-22|23.185723840531633|   null|
|      K|2003-03-22|54.726704199112234|   null|
|      L|2003-03-22| 41.31208393751526|   null|
|      E|2003-03-22|29.204595660498747|   null|
|      O|2003-03-22|42.262047298692025|   null|
|      D|2003-03-22| 34.15144930192795|   null|
|      G|2003-03-22|57.704538511309465|   null|
|      M|2003-03-22|37.048301269742446|   null|
|      H|2003-03-22| 47.42255716739459|   null|
+-------+----------+-----------------

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

-------------------------------------------
Batch: 3
-------------------------------------------
+-------+----------+----------+------------+------------------+------------------+
|station|     start|       end|measurements|               avg|               max|
+-------+----------+----------+------------+------------------+------------------+
|      K|2000-01-01|2003-04-03|        1189|59.288560389164175|103.52487644548896|
|      F|2000-01-01|2003-04-03|        1189| 49.83530134806887| 97.49961751381443|
|      I|2000-01-01|2003-04-03|        1189|58.240916975253285|104.63674611741688|
|      N|2000-01-01|2003-04-03|        1189| 61.25277241631171|113.65827497049271|
|      E|2000-01-01|2003-04-03|        1189| 49.10149210069852|106.64713194169838|
|      J|2000-01-01|2003-04-03|        1189|46.357390454135015| 97.56021527535312|
|      A|2000-01-01|2003-04-03|        1189| 50.46573193680899| 98.12392490043354|
|      H|2000-01-01|2003-04-03|        1189|57.876013027925076| 105.50698

In [26]:
counts_df.toPandas()

AnalysisException: Queries with streaming sources must be executed with writeStream.start();
kafka

In [None]:
today=
features=