In [1]:
from pyspark.sql.session import SparkSession
import os
# Parent of the current working directory, kept with its trailing slash;
# later cells build file locations (e.g. the jar paths) on top of this prefix.
path = os.getcwd() + "/../"

In [2]:
# Maven coordinates handed to `spark.jars.packages`.
# Keep the kafka-clients version in sync with the kafka-clients jar listed in
# `jars` below, so that a single version of the client is on the classpath.
packages = [
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1",
    "org.apache.kafka:kafka-clients:3.5.1",
]

# Local jar files handed to `spark.jars`, looked up under `path`.
jars = [
    f"{path}/jars/spark-sql-kafka-0-10_2.12-3.4.1.jar",
    f"{path}/jars/kafka-clients-3.5.1.jar",
    f"{path}/jars/mysql-connector-j-8.0.31.jar",
    f"{path}/jars/commons-pool2-2.11.1.jar",
    f"{path}/jars/spark-token-provider-kafka-0-10_2.12-3.4.1.jar",
]

# Create (or reuse) the session against the master at spark://spark:7077.
# Both settings expect a single comma-separated string, hence the joins.
spark = (
    SparkSession.builder
    .appName("test_kafka")
    .master("spark://spark:7077")
    .config("spark.jars", ",".join(jars))
    .config("spark.jars.packages", ",".join(packages))
    .getOrCreate()
)

In [3]:
# Echo the session object as the cell's output.
spark

In [4]:
# Show the default parallelism reported by the session's SparkContext.
spark.sparkContext.defaultParallelism

2

In [35]:
# spark.stop()  # uncomment and run to stop the Spark session when finished

In [88]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, DateType, TimestampType

# Layout used to parse the JSON payload of each message.
# All BME280 sensor fields are declared as strings; they are converted to
# numeric / timestamp types in a later cell.
_bme280_type = StructType([
    StructField(field_name, StringType())
    for field_name in ("pressure", "temperature", "humidity", "read_datetime")
])

# Pico W board fields: its local IP address and a temperature already typed as double.
_picow_type = StructType([
    StructField("local_ip", StringType()),
    StructField("temperature", DoubleType()),
])

schema = StructType([
    StructField("bme280", _bme280_type),
    StructField("picow", _picow_type),
])

In [102]:
# Batch read (spark.read, not readStream) of the `iot_source` topic from the
# broker at kafka:9092. No offset range is configured; the options below were
# left disabled:
#   .option("startingOffsets", "earliest")
#   .option("endingOffsets", "latest")
_kafka_reader = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "iot_source")
)
picow_df = _kafka_reader.load()

# Keep records whose offset is greater than 2 and expose key/value as strings.
# NOTE(review): the cut-off of 2 is not explained anywhere in this notebook —
# presumably it skips early test messages; confirm. Kafka offsets are counted
# per partition, so verify this filter if the topic has more than one.
_after_offset = picow_df.where("offset > 2")
df = _after_offset.selectExpr(
    "CAST(key AS STRING)",
    "CAST(value AS STRING)",
)

In [109]:
from pyspark.sql.functions import lit, explode, col, from_json, regexp_replace, to_timestamp


# Staging frame: parse the JSON text in `value` with `schema`, then flatten the
# nested structs into one top-level column per reading. The projection lists
# every output column explicitly, so `key` and `value` are not carried over and
# no separate drop is needed.
stg_df = (
    df
    .withColumn("data", from_json(col("value"), schema))
    .selectExpr(
        "data.bme280.pressure as pressure",
        "data.bme280.temperature as temperature",
        "data.bme280.humidity as humidity",
        "data.bme280.read_datetime as read_datetime",
        "data.picow.local_ip as picow_local_ip",
        "data.picow.temperature as picow_temperature",
    )
)

# Final frame: remove the unit text ("hPa", "C", "%") from the BME280 string
# readings and cast them to double, then parse the reading time into a timestamp.
# The patterns are unanchored, so every occurrence of the unit text is removed.
final_df = (
    stg_df
    .withColumn("pressure", regexp_replace(col("pressure"), "hPa", "").cast(DoubleType()))
    .withColumn("temperature", regexp_replace(col("temperature"), "C", "").cast(DoubleType()))
    .withColumn("humidity", regexp_replace(col("humidity"), "%", "").cast(DoubleType()))
    .withColumn("read_datetime", to_timestamp(col("read_datetime"), "yyyy-M-d HH:mm:ss"))
)

# Inspect the resulting column types and the first 10 rows without truncating values.
final_df.printSchema()
final_df.show(10, truncate=False)


root
 |-- pressure: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- read_datetime: timestamp (nullable = true)
 |-- picow_local_ip: string (nullable = true)
 |-- picow_temperature: double (nullable = true)

+--------+-----------+--------+-------------------+--------------+-----------------+
|pressure|temperature|humidity|read_datetime      |picow_local_ip|picow_temperature|
+--------+-----------+--------+-------------------+--------------+-----------------+
|890.07  |21.92      |44.83   |2023-09-06 16:39:51|192.168.1.74  |24.24184         |
+--------+-----------+--------+-------------------+--------------+-----------------+

