In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json

# Reuse the running Spark session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("SparkStreamingDemo")
    .getOrCreate()
)

# Subscribe to the 'topic-example' Kafka topic, reading from the earliest offset.
df_stream = (
    spark.readStream
    .format("kafka")
    .option('kafka.bootstrap.servers', 'kafka:9092')
    .option('subscribe', 'topic-example')
    .option('startingOffsets', 'earliest')
    .load()
)

# Print each micro-batch to the console. awaitTermination() is called without
# a timeout, so this cell blocks until the query stops or fails.
query = (
    df_stream.writeStream
    .format("console")
    .start()
)
query.awaitTermination()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/08 11:41:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/08 11:41:51 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-faced9a2-a68f-4716-9a87-c40b24a01496. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/08 11:41:51 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/10/08 11:41:51 WARN KafkaOffsetReaderAdmin: Error in attempt 1 getting Kafka offsets: 
java.util.concurrent.ExecutionException: org.apache.kafka.common.errors.UnknownTopicOrPartitionException: Th

StreamingQueryException: [STREAM_FAILED] Query [id = 68d80c2b-e82b-4181-bf4c-445e15d1b6ce, runId = 01902a07-8d00-4b8e-a108-66a9a5f54f7d] terminated with exception: org.apache.kafka.common.errors.UnknownTopicOrPartitionException: This server does not host this topic-partition.

# Exercice

1) Créer un topic Kafka avec KafkaAdminClient.
2) Se connecter en stream sur le topic avec Spark.
3) Envoyer les données présentes dans `data/persons.json` dans le topic avec KafkaProducer.


In [3]:
# Kafka bootstrap address (host:port) shared by the admin client, the producer
# and the Spark Kafka sources in this notebook.
kafka_server = "kafka:9092"

In [5]:
import json
from kafka.admin import KafkaAdminClient, NewTopic
from kafka import KafkaProducer

topic = 'persons'

# Create the topic only when it does not exist yet, so the cell can be re-run.
# The admin client is closed even if listing or creating the topic fails.
admin = KafkaAdminClient(bootstrap_servers=kafka_server)
try:
    topic_list = admin.list_topics()
    topic_created = topic not in topic_list
    if topic_created:
        new_topic = NewTopic(name=topic, num_partitions=1, replication_factor=1)
        admin.create_topics(new_topics=[new_topic])
finally:
    admin.close()

# Each message value is serialized as UTF-8 encoded JSON.
producer = KafkaProducer(
    bootstrap_servers=[kafka_server],
    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
)

# Seed the topic with the sample persons only when it was just created;
# re-running the cell against an existing topic must not duplicate messages.
if topic_created:
    with open('data/persons.json', 'r', encoding='utf-8') as file:
        persons = json.load(file)

    for person in persons:
        producer.send(topic, value=person)

    # send() is asynchronous: block until every buffered message is delivered.
    producer.flush()

# The producer is left open so it can be reused; call producer.close() when done.

In [7]:
from pyspark.sql import SparkSession
topic = 'persons'
spark = SparkSession.builder.appName("SparkStreamingDemo").getOrCreate()

# Stream the raw Kafka records of `topic`, starting from the earliest offset.
df_stream = (
    spark.readStream.format("kafka")
    .option('kafka.bootstrap.servers', kafka_server)
    .option('subscribe', topic)
    .option('startingOffsets', 'earliest')
    .load()
)

# Print the micro-batches to the console for at most 10 seconds, then always
# stop the query so it does not keep running in the background after this
# cell returns (every re-run would otherwise start one more query).
query = df_stream.writeStream.format("console").start()
try:
    query.awaitTermination(10)
finally:
    query.stop()

25/10/07 11:27:05 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-08901906-c921-4254-9461-dca1adfc40ac. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/07 11:27:05 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+--------------------+-------+---------+------+--------------------+-------------+
| key|               value|  topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-------+---------+------+--------------------+-------------+
|null|[7B 22 69 64 22 3...|persons|        0|     0|2025-10-07 11:27:...|            0|
|null|[7B 22 69 64 22 3...|persons|        0|     1|2025-10-07 11:27:...|            0|
|null|[7B 22 69 64 22 3...|persons|        0|     2|2025-10-07 11:27:...|            0|
|null|[7B 22 69 64 22 3...|persons|        0|     3|2025-10-07 11:27:...|            0|
|null|[7B 22 69 64 22 3...|persons|        0|     4|2025-10-07 11:27:...|            0|
|null|[7B 22 69 64 22 3...|persons|        0|     5|2025-10-07 11:27:...|            0|
|null|[7B 22 69 64 22 3...|persons|        0|     6|2025-10-07 11:27:...|            0|
|null|[7B 22 69 64 22 3

False

# Afficher uniquement la valeur (value) du message Kafka en string

In [15]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SparkStreamingDemo").getOrCreate()

# Stream the raw Kafka records of `topic`, starting from the earliest offset.
df_stream = (
    spark.readStream.format("kafka")
    .option('kafka.bootstrap.servers', kafka_server)
    .option('subscribe', topic)
    .option('startingOffsets', 'earliest')
    .load()
)

# Keep only the Kafka message value, cast from bytes to a string.
lines = df_stream.selectExpr("CAST(value AS STRING)")

# Print the micro-batches to the console for at most 10 seconds, then always
# stop the query so it does not keep running in the background after this
# cell returns.
query = lines.writeStream.format("console").start()
try:
    query.awaitTermination(10)
finally:
    query.stop()

25/10/07 08:45:18 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-a288d87f-8d60-4a24-beb7-3b4353d8c1f5. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/07 08:45:18 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+--------------------+
|               value|
+--------------------+
|{"id": 1, "name":...|
|{"id": 2, "name":...|
|{"id": 3, "name":...|
|{"id": 4, "name":...|
|{"id": 5, "name":...|
|{"id": 6, "name":...|
|{"id": 7, "name":...|
|{"id": 8, "name":...|
|{"id": 9, "name":...|
|{"id": 10, "name"...|
|{"id": 11, "name"...|
|{"id": 12, "name"...|
|{"id": 13, "name"...|
|{"id": 14, "name"...|
|{"id": 15, "name"...|
|{"id": 16, "name"...|
|{"id": 17, "name"...|
|{"id": 18, "name"...|
|{"id": 19, "name"...|
|{"id": 20, "name"...|
+--------------------+
only showing top 20 rows



False

# Faire une exécution batch par batch (foreachBatch)

In [8]:
from pyspark.sql import SparkSession

# Get (or create) the Spark session used by this cell.
spark = (
    SparkSession.builder
    .appName("SparkExample3")
    .getOrCreate()
)

# Streaming source: Kafka records of `topic`, read from the earliest offset.
df_stream = (
    spark.readStream
    .format("kafka")
    .option('kafka.bootstrap.servers', kafka_server)
    .option('subscribe', topic)
    .option('startingOffsets', 'earliest')
    .load()
)

def process_batch(batch_df, batch_id):
    """Append one micro-batch to the ``public.raw_persons`` PostgreSQL table.

    Intended to be passed to ``writeStream.foreachBatch``.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch (unused here).

    The database credentials are read from the ``EVENTS_DB_USER`` and
    ``EVENTS_DB_PASSWORD`` environment variables. When a variable is not set,
    the demo defaults ("app" / "1234") are used so existing setups keep working.
    """
    import os

    db_user = os.environ.get("EVENTS_DB_USER", "app")
    db_password = os.environ.get("EVENTS_DB_PASSWORD", "1234")

    (
        batch_df.write.format("jdbc")
        .option("url", "jdbc:postgresql://postgres:5432/events")
        .option("dbtable", "public.raw_persons")
        .option("user", db_user)
        .option("password", db_password)
        .mode("append")
        .save()
    )

# Hand every micro-batch of the stream to process_batch, let the query run
# for up to 10 seconds, then stop it.
query = (
    df_stream.writeStream
    .foreachBatch(process_batch)
    .start()
)
query.awaitTermination(10)
query.stop()

25/10/07 11:27:59 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
25/10/07 11:27:59 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-2ca5e3b1-5a11-45ab-a43b-f81f910bb48d. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/07 11:27:59 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

# Formater les données JSON reçues

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Get (or create) the Spark session used by this cell.
spark = (
    SparkSession.builder
    .appName("SparkExample3")
    .getOrCreate()
)

# Streaming source: Kafka records of `topic`, read from the earliest offset.
df_stream = (
    spark.readStream
    .format("kafka")
    .option('kafka.bootstrap.servers', kafka_server)
    .option('subscribe', topic)
    .option('startingOffsets', 'earliest')
    .load()
)

# Expected layout of one person JSON document: four fields, all declared
# non-nullable, in this order.
schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("age", IntegerType()),
        ("city", StringType()),
    )
])

# Decode the Kafka value as a string, parse it as JSON using `schema`, then
# flatten the parsed struct so each schema field becomes a top-level column.
lines = (
    df_stream
    .selectExpr("CAST(value AS STRING) as json_str")
    .select(from_json(col("json_str"), schema).alias("data"))
    .select("data.*")
)

def process_batch(batch_df, batch_id):
    """Append one micro-batch to the ``public.persons`` PostgreSQL table.

    Intended to be passed to ``writeStream.foreachBatch``.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch (unused here).

    The database credentials are read from the ``EVENTS_DB_USER`` and
    ``EVENTS_DB_PASSWORD`` environment variables. When a variable is not set,
    the demo defaults ("app" / "1234") are used so existing setups keep working.
    """
    import os

    db_user = os.environ.get("EVENTS_DB_USER", "app")
    db_password = os.environ.get("EVENTS_DB_PASSWORD", "1234")

    (
        batch_df.write.format("jdbc")
        .option("url", "jdbc:postgresql://postgres:5432/events")
        .option("dbtable", "public.persons")
        .option("user", db_user)
        .option("password", db_password)
        .mode("append")
        .save()
    )

# Hand every micro-batch of the parsed stream to process_batch, let the query
# run for up to 10 seconds, then stop it.
query = (
    lines.writeStream
    .foreachBatch(process_batch)
    .start()
)
query.awaitTermination(10)
query.stop()

25/10/07 13:10:34 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-6b41a4b4-7697-4fc3-a488-bf37aa34b61f. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/07 13:10:34 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

# Envoi de données sur Kafka

In [17]:
# Publish the `id` column, cast to string, as the Kafka message value on the
# 'persons-silver' topic. The broker address comes from `kafka_server`, like
# the Kafka sources in this notebook, instead of being repeated as a literal.
silver_to_kafka = (
    lines.select(col('id').cast('string').alias('value'))
    .writeStream.format('kafka')
    .option('kafka.bootstrap.servers', kafka_server)
    .option('topic', 'persons-silver')
    .option('checkpointLocation', '/tmp/checkpoint')
    .start()
)

# Let the query run for at most 10 seconds, then always stop it so it does not
# keep running in the background after this cell returns.
try:
    silver_to_kafka.awaitTermination(10)
finally:
    silver_to_kafka.stop()

25/10/07 13:17:22 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/10/07 13:17:23 WARN NetworkClient: [Producer clientId=producer-1] Error while fetching metadata with correlation id 1 : {persons-silver=LEADER_NOT_AVAILABLE}
                                                                                

False

# Lecture de la base de données en batch (non-stream)


In [18]:
import os

# Batch (non-streaming) read of the public.persons table over JDBC.
# Credentials come from the EVENTS_DB_USER / EVENTS_DB_PASSWORD environment
# variables; the demo defaults are used when a variable is not set.
df = (
    spark.read.format('jdbc')
    .option('url', 'jdbc:postgresql://postgres:5432/events')
    .option('dbtable', 'public.persons')
    .option('user', os.environ.get('EVENTS_DB_USER', 'app'))
    .option('password', os.environ.get('EVENTS_DB_PASSWORD', '1234'))
    .load()
)

# Preview the first 10 rows.
df.show(10)

+---+---------+---+----------+
| id|     name|age|      city|
+---+---------+---+----------+
|  1|    Alice| 30|     Paris|
|  2|      Bob| 25|      Lyon|
|  3|   Céline| 35| Marseille|
|  4|    David| 28|     Paris|
|  5|     Emma| 40|  Bordeaux|
|  6| François| 22|      Nice|
|  7|Gabrielle| 31|Strasbourg|
|  8|     Hugo| 27|     Lille|
|  9|     Inès| 29|    Nantes|
| 10|   Julien| 33|  Toulouse|
+---+---------+---+----------+
only showing top 10 rows



In [2]:
from pyspark.sql import SparkSession

# Request the PostgreSQL JDBC driver package when building the session.
# NOTE(review): the warning recorded under this cell ("Using an existing Spark
# session; only runtime SQL configurations will take effect") suggests this
# package setting is ignored when a session already exists - confirm that this
# cell runs before any other session is created.
spark = (
    SparkSession.builder
    .appName("NotebookKafka")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.6.0")
    .getOrCreate()
)

25/10/08 11:42:07 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [7]:
from pyspark.sql.functions import col

# Load the train lines JSON file, drop the `_corrupt_record` column, then
# discard every row that still contains a null value.
train_lines = (
    spark.read.json("./data/train_lines.json")
    .drop(col('_corrupt_record'))
    .dropna()
)

# Preview the first 10 rows without truncating cell contents.
train_lines.show(10, truncate=False)

+----------------+-----------------+-----------+-------+---------+-----------------+
|avg_duration_min|destination      |distance_km|line_id|line_name|origin           |
+----------------+-----------------+-----------+-------+---------+-----------------+
|62              |Liège-Guillemins |98         |1      |IC-01    |Bruxelles-Midi   |
|55              |Namur            |64         |2      |IC-02    |Bruxelles-Central|
|49              |Gand-Saint-Pierre|56         |3      |IC-03    |Anvers-Central   |
|57              |Charleroi-Sud    |61         |4      |IC-04    |Bruxelles-Nord   |
|115             |Luxembourg       |125        |5      |IC-05    |Liège-Guillemins |
|82              |Bruges           |93         |6      |IC-06    |Anvers-Central   |
|52              |Mons             |64         |7      |IC-07    |Bruxelles-Midi   |
|95              |Ostende          |120        |8      |IC-08    |Bruxelles-Midi   |
|48              |Liège-Guillemins |57         |9      |IC-09    

In [12]:
# Append the cleaned train lines to the Parquet dataset stored on HDFS.
(
    train_lines.write
    .format("parquet")
    .mode("append")
    .save("hdfs://namenode:9000/user/spark/train_lines")
)

                                                                                

In [13]:
# Read the Parquet dataset back from HDFS and preview the first 10 rows
# without truncating cell contents.
hdfs_df = (
    spark.read
    .format("parquet")
    .load("hdfs://namenode:9000/user/spark/train_lines")
)
hdfs_df.show(10, truncate=False)

+----------------+-----------------+-----------+-------+---------+-----------------+
|avg_duration_min|destination      |distance_km|line_id|line_name|origin           |
+----------------+-----------------+-----------+-------+---------+-----------------+
|62              |Liège-Guillemins |98         |1      |IC-01    |Bruxelles-Midi   |
|55              |Namur            |64         |2      |IC-02    |Bruxelles-Central|
|49              |Gand-Saint-Pierre|56         |3      |IC-03    |Anvers-Central   |
|57              |Charleroi-Sud    |61         |4      |IC-04    |Bruxelles-Nord   |
|115             |Luxembourg       |125        |5      |IC-05    |Liège-Guillemins |
|82              |Bruges           |93         |6      |IC-06    |Anvers-Central   |
|52              |Mons             |64         |7      |IC-07    |Bruxelles-Midi   |
|95              |Ostende          |120        |8      |IC-08    |Bruxelles-Midi   |
|48              |Liège-Guillemins |57         |9      |IC-09    