In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

scala_version='2.12'
spark_version='3.5.1'

package = f"org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version},org.apache.kafka:kafka-clients:3.5.0"


spark = SparkSession.builder \
    .appName("kafka-example") \
    .config("spark.jars.packages", package) \
    .config("spark.ui.showConsoleProgress", "false") \
    .getOrCreate()

spark

In [2]:
kafka_server = "kafka:9092" 
topic_name = "LeNguyenHoangPhuc_RandomNumber"         

kafkaDf = spark.read.format("kafka") \
    .option("kafka.bootstrap.servers", kafka_server) \
    .option("subscribe", topic_name) \
    .option("startingOffsets", "earliest") \
    .load()


In [3]:
kafkaDf.show()

+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|NULL|[7B 22 6E 75 6D 6...|LeNguyenHoangPhuc...|        0|     0|2025-10-17 16:32:...|            0|
|NULL|[7B 22 6E 75 6D 6...|LeNguyenHoangPhuc...|        0|     1|2025-10-17 16:32:...|            0|
|NULL|[7B 22 6E 75 6D 6...|LeNguyenHoangPhuc...|        0|     2|2025-10-17 16:32:...|            0|
|NULL|[7B 22 6E 75 6D 6...|LeNguyenHoangPhuc...|        0|     3|2025-10-17 16:32:...|            0|
|NULL|[7B 22 6E 75 6D 6...|LeNguyenHoangPhuc...|        0|     4|2025-10-17 16:32:...|            0|
|NULL|[7B 22 6E 75 6D 6...|LeNguyenHoangPhuc...|        0|     5|2025-10-17 16:32:...|            0|
|NULL|[7B 22 6E 75 6D 6...|LeNguyenHoangPhuc...|        0|     6|2025-10-17 16:32:...|     

In [4]:
kafkaDf.toPandas()

Unnamed: 0,key,value,topic,partition,offset,timestamp,timestampType
0,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,0,2025-10-17 16:32:17.482,0
1,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,1,2025-10-17 16:32:22.488,0
2,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,2,2025-10-17 16:32:27.495,0
3,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,3,2025-10-17 16:32:32.497,0
4,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,4,2025-10-17 16:32:37.502,0
5,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,5,2025-10-17 16:32:42.505,0
6,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,6,2025-10-17 16:32:47.509,0
7,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,7,2025-10-17 16:32:52.579,0
8,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,8,2025-10-17 16:32:57.599,0
9,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,9,2025-10-17 16:33:02.659,0


In [5]:
from pyspark.sql.functions import col

# Kiểm tra schema trước
kafkaDf.printSchema()


root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [6]:
batchDF = kafkaDf.select(col('topic'),col('offset'),col('value').cast('string').substr(12,1).alias('rand_number'))

from time import sleep
from IPython.display import display, clear_output

try:
    for x in range(0,2000):
        print("Showing live view refreshed every 5 seconds")
        print(f"Seconds passed: {x*5}")
        display(batchDF.toPandas())
        sleep(5)
        clear_output(wait=True)
except KeyboardInterrupt:
    print("break")
print("Live view ended...")


Showing live view refreshed every 5 seconds
Seconds passed: 10


Unnamed: 0,topic,offset,rand_number
0,LeNguyenHoangPhuc_RandomNumber,0,0
1,LeNguyenHoangPhuc_RandomNumber,1,1
2,LeNguyenHoangPhuc_RandomNumber,2,2
3,LeNguyenHoangPhuc_RandomNumber,3,3
4,LeNguyenHoangPhuc_RandomNumber,4,4
5,LeNguyenHoangPhuc_RandomNumber,5,5
6,LeNguyenHoangPhuc_RandomNumber,6,6
7,LeNguyenHoangPhuc_RandomNumber,7,7
8,LeNguyenHoangPhuc_RandomNumber,8,8
9,LeNguyenHoangPhuc_RandomNumber,9,9


break
Live view ended...


In [7]:
batchCountDF = batchDF.groupBy('rand_number').count()

for x in range(0,2000):
    try:
        print("Showing live view refreshed every 5 seconds")
        print(f"Seconds passed: {x*5}")
        display(batchCountDF.toPandas())
        sleep(5)
        clear_output(wait=True)
    except KeyboardInterrupt:
        print("break")
        break
print("Live view ended...")

Showing live view refreshed every 5 seconds
Seconds passed: 40


Unnamed: 0,rand_number,count
0,7,1
1,3,5
2,8,1
3,0,1
4,5,1
5,6,1
6,9,1
7,1,11
8,4,1
9,2,11


break
Live view ended...
