In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

scala_version='2.12'
spark_version='3.5.1'

package = f"org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version},org.apache.kafka:kafka-clients:3.5.0"


spark = SparkSession.builder \
    .appName("kafka-example") \
    .config("spark.jars.packages", package) \
    .config("spark.ui.showConsoleProgress", "false") \
    .getOrCreate()

spark

In [3]:
kafka_server = "kafka:9092" 
topic_name = "LeNguyenHoangPhuc_RandomNumber"         

kafkaDf = spark.read.format("kafka") \
    .option("kafka.bootstrap.servers", kafka_server) \
    .option("subscribe", topic_name) \
    .option("startingOffsets", "earliest") \
    .load()


In [4]:
kafkaDf.show()

+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|NULL|[7B 22 6E 75 6D 6...|LeNguyenHoangPhuc...|        0|     0|2025-10-19 03:36:...|            0|
|NULL|[7B 22 6E 75 6D 6...|LeNguyenHoangPhuc...|        0|     1|2025-10-19 03:36:...|            0|
|NULL|[7B 22 6E 75 6D 6...|LeNguyenHoangPhuc...|        0|     2|2025-10-19 03:36:...|            0|
|NULL|[7B 22 6E 75 6D 6...|LeNguyenHoangPhuc...|        0|     3|2025-10-19 03:36:...|            0|
|NULL|[7B 22 6E 75 6D 6...|LeNguyenHoangPhuc...|        0|     4|2025-10-19 03:36:...|            0|
|NULL|[7B 22 6E 75 6D 6...|LeNguyenHoangPhuc...|        0|     5|2025-10-19 03:36:...|            0|
|NULL|[7B 22 6E 75 6D 6...|LeNguyenHoangPhuc...|        0|     6|2025-10-19 03:36:...|     

In [5]:
kafkaDf.toPandas()

Unnamed: 0,key,value,topic,partition,offset,timestamp,timestampType
0,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,0,2025-10-19 03:36:27.963,0
1,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,1,2025-10-19 03:36:32.965,0
2,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,2,2025-10-19 03:36:37.968,0
3,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,3,2025-10-19 03:36:42.972,0
4,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,4,2025-10-19 03:36:47.982,0
...,...,...,...,...,...,...,...
186,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,186,2025-10-19 03:53:32.341,0
187,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,187,2025-10-19 03:53:37.345,0
188,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,188,2025-10-19 03:53:42.348,0
189,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",LeNguyenHoangPhuc_RandomNumber,0,189,2025-10-19 03:53:47.350,0


In [6]:
from pyspark.sql.functions import col

# Kiểm tra schema trước
kafkaDf.printSchema()


root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [7]:
batchDF = kafkaDf.select(col('topic'),col('offset'),col('value').cast('string').substr(12,1).alias('rand_number'))

from time import sleep
from IPython.display import display, clear_output
x = 0
try:
    while(True):
        print("Showing live view refreshed every 5 seconds")
        print(f"Seconds passed: {x*5}")
        display(batchDF.toPandas())
        sleep(5)
        x += 1
        clear_output(wait=True)
except KeyboardInterrupt:
    print("break")
print("Live view ended...")


Showing live view refreshed every 5 seconds
Seconds passed: 80


Unnamed: 0,topic,offset,rand_number
0,LeNguyenHoangPhuc_RandomNumber,0,0
1,LeNguyenHoangPhuc_RandomNumber,1,1
2,LeNguyenHoangPhuc_RandomNumber,2,2
3,LeNguyenHoangPhuc_RandomNumber,3,3
4,LeNguyenHoangPhuc_RandomNumber,4,4
...,...,...,...
205,LeNguyenHoangPhuc_RandomNumber,205,4
206,LeNguyenHoangPhuc_RandomNumber,206,4
207,LeNguyenHoangPhuc_RandomNumber,207,4
208,LeNguyenHoangPhuc_RandomNumber,208,4


break
Live view ended...


In [8]:
batchCountDF = batchDF.groupBy('rand_number').count()

for x in range(0,2000):
    try:
        print("Showing live view refreshed every 5 seconds")
        print(f"Seconds passed: {x*5}")
        display(batchCountDF.toPandas())
        sleep(5)
        clear_output(wait=True)
    except KeyboardInterrupt:
        print("break")
        break
print("Live view ended...")

Showing live view refreshed every 5 seconds
Seconds passed: 50


Unnamed: 0,rand_number,count
0,7,12
1,3,22
2,8,12
3,0,2
4,5,12
5,6,12
6,9,12
7,1,87
8,4,17
9,2,22


break
Live view ended...
