In [1]:
from pyspark.sql import SparkSession

In [4]:
# Create (or reuse) the session against the Spark master at spark-master:7077
# and request the spark-sql-kafka connector package (Scala 2.12 / 3.1.2 build).
# An optional core cap, .config('spark.cores.max', 2), is intentionally left out.
spark = (
    SparkSession.builder
    .master('spark://spark-master:7077')
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2')
    .appName('spark-streaming-operation-read')
    .getOrCreate()
)
print(spark.version)


3.1.2


In [5]:
# Streaming source: the "rate" format, configured for 100 records per second.
baseStream = (
    spark.readStream
    .format("rate")
    .option("recordsPerSecond", 100)
    .load()
)

In [6]:
# Print the column names and types of the rate stream.
baseStream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)



In [7]:
# Bare expression: the notebook displays the DataFrame's repr as the cell output.
baseStream

DataFrame[timestamp: timestamp, value: bigint]

In [8]:
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql.functions import udf, lit
import random 

# UDF producing a pseudo-random sensor id drawn from range(10000) for each row.
# The column argument `x` is ignored; only the per-row invocation matters.
#  - str(...) makes the Python return value agree with the declared StringType
#    instead of relying on Spark converting the int for us.
#  - asNondeterministic() is required because the result is random: UDFs are
#    treated as deterministic by default, which allows the optimizer to
#    eliminate or duplicate invocations.
my_udf = udf(
    lambda x: str(random.randrange(10000)),
    StringType(),
).asNondeterministic()

In [9]:
# Build the outgoing rows: append a sensorId column produced by my_udf, then
# replace the numeric value column with its string cast.
sensorValues = (
    baseStream
    .withColumn("sensorId", my_udf(lit(10000)))
    .withColumn("value", baseStream["value"].cast('string'))
)

In [10]:
# Print the schema after adding sensorId and casting value to string.
sensorValues.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: string (nullable = true)
 |-- sensorId: string (nullable = true)



In [11]:
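# Debug alternative (disabled): write the generated rows to the console sink
# instead of Kafka. Uncomment to inspect the stream; the call blocks the cell.
# test_df = sensorValues.writeStream.format("console")\
#                                 .outputMode("append")\
#                                 .option("truncate", "false")\
#                                 .start()\
#                                 .awaitTermination()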

In [None]:
# Stream the generated rows to the Kafka topic "iot-data2" and block this cell
# until the query terminates.
#
# The StreamingQuery handle returned by start() is kept in its own variable:
# awaitTermination() returns None, so chaining it onto start() left nothing
# usable behind. The query is stopped in `finally` so that interrupting the
# cell does not leave "kafkaWriter3" active in the session, which would make
# re-running this cell fail on the already-active query name.
#
# NOTE(review): the Kafka sink is documented to use only the key/value/topic/
# partition/headers columns, so timestamp and sensorId are presumably not
# written - confirm whether sensorId was meant to be the message key.
# NOTE(review): failOnDataLoss is documented as a Kafka *source* option and
# likely has no effect on this sink - confirm before relying on it.
kafka_query = (
    sensorValues.writeStream
    .format("kafka")
    .queryName("kafkaWriter3")
    .outputMode("append")
    .option("kafka.bootstrap.servers", "10.5.0.10:9092")
    .option("topic", "iot-data2")
    .option("checkpointLocation", "./tmp/streaming-with-spark/generator-checkpoint")
    .option("failOnDataLoss", "false")
    .start()
)
try:
    kafka_query.awaitTermination()
finally:
    # Runs on normal termination, query failure, and kernel interrupt alike.
    kafka_query.stop()