In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

scala_version = '2.12'
# Take the connector version from the imported PySpark package instead of a
# hand-maintained literal, so the Kafka integration jar cannot drift from the
# Spark runtime it is loaded into.
spark_version = pyspark.__version__

# Maven coordinates resolved when the session starts: the Structured Streaming
# Kafka source plus an explicitly pinned kafka-clients artifact.
packages = ",".join([
    f"org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}",
    "org.apache.kafka:kafka-clients:3.5.0",
])

# Local-mode session using all available cores, with the console progress bar
# switched off.
spark = (
    SparkSession.builder
    .appName("CNN and Kafka")
    .config("spark.jars.packages", packages)
    .config("spark.ui.showConsoleProgress", "false")
    .master("local[*]")
    .getOrCreate()
)

spark

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c1c4119f-73e3-4f92-9348-5486b690d99f;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.7 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
	found org.apache.kafka#kafka-clients;3.5.0 in central
	found com.github.luben#zstd-jni;1.5.5-1 in central
	found org.l

In [2]:
from pyspark.sql.types import ArrayType, FloatType
# Artefact locations and settings carried over from the training notebook.
MODEL_PATH = 'cnn_multi_aspect_model.h5'
TOKENIZER_PATH = 'tokenizer.pickle'
MAX_SEQUENCE_LENGTH = 100

ASPECT_COLUMNS = [
    'Price', 'Shipping', 'Outlook', 'Quality',
    'Size', 'Shop_Service', 'General', 'Others',
]

# label_map sends an original label (-1, 0, 1, 2) to its class index (0-3);
# inverse_label_map goes the other way, from class index back to the label.
label_map = {-1: 0, 0: 1, 1: 2, 2: 3}
inverse_label_map = dict(zip(label_map.values(), label_map.keys()))

# Publish the artefact paths as broadcast variables so code running on the
# executors can read them.
sc = spark.sparkContext
broadcasted_model_path, broadcasted_tokenizer_path = (
    sc.broadcast(artefact_path) for artefact_path in (MODEL_PATH, TOKENIZER_PATH)
)

# Return type of the prediction UDF: an array of float arrays, intended to hold
# 8 sub-arrays (one per aspect) of 4 probabilities each.
schema_output = ArrayType(elementType=ArrayType(elementType=FloatType()))

In [3]:
from typing import Iterator
from pandas import Series
from pyspark.sql.functions import pandas_udf
@pandas_udf(schema_output)
def predict_sentiments_udf(iterator: Iterator[Series]) -> Iterator[Series]:
    """Score batches of review text with the saved Keras model.

    The model and tokenizer are loaded once per invocation (before the first
    batch is consumed) from the paths held in ``broadcasted_model_path`` and
    ``broadcasted_tokenizer_path``; every batch from ``iterator`` is then
    cleaned, tokenised, padded to ``MAX_SEQUENCE_LENGTH`` and passed to
    ``model.predict``.

    Args:
        iterator: Batches of review strings, one ``pandas.Series`` per batch.

    Yields:
        One ``pandas.Series`` per input batch. Each element is a list of
        per-output probability lists for the corresponding review (assumes the
        model returns one array per aspect -- TODO confirm against the
        training notebook).

    Raises:
        FileNotFoundError: If the model or tokenizer file does not exist on
            the machine running this UDF.
    """
    from tensorflow.keras.models import load_model
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from pickle import load
    from re import sub
    from os import path

    model_path = broadcasted_model_path.value
    tokenizer_path = broadcasted_tokenizer_path.value

    if not path.exists(model_path) or not path.exists(tokenizer_path):
        raise FileNotFoundError(f"Model/Tokenizer không tìm thấy trên worker. Đảm bảo {model_path} và {tokenizer_path} có thể truy cập được.")

    model = load_model(model_path)
    # NOTE(security): pickle.load can execute arbitrary code; only point
    # TOKENIZER_PATH at a file produced by a trusted training run.
    with open(tokenizer_path, 'rb') as handle:
        tokenizer = load(handle)

    def clean_text_udf(text):
        """Lower-case the text and strip punctuation, digits and outer whitespace."""
        text = str(text).lower()
        text = sub(r'[^\w\s]', '', text)
        text = sub(r'\d+', '', text)
        return text.strip()

    for comments_batch in iterator:
        # Null reviews (for example from records whose JSON did not parse) are
        # blanked first; str(None) would otherwise feed the word "none" to the
        # tokenizer as if it were review text.
        cleaned_comments = comments_batch.fillna('').apply(clean_text_udf)

        sequences = tokenizer.texts_to_sequences(cleaned_comments)
        padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

        if len(padded_sequences) > 0:
            predictions = model.predict(padded_sequences, verbose=0)
            # Regroup the model output so each review gets its own list of
            # probability lists.
            result = [list(map(lambda x: x.tolist(), p)) for p in zip(*predictions)]
        else:
            result = []

        # Emit the predictions for this batch.
        yield Series(result)

In [4]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType
# Schema of the JSON payload: the review text followed by one nullable integer
# label column per aspect.
json_fields = [StructField("review_text", StringType())]
json_fields.extend(
    StructField(aspect_name, IntegerType(), True) for aspect_name in ASPECT_COLUMNS
)
json_schema = StructType(json_fields)

In [5]:
KAFKA_SERVER = "kafka:9092"
TOPIC_NAME = "review_stream"

# Streaming DataFrame subscribed to TOPIC_NAME on the KAFKA_SERVER broker.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option("subscribe", TOPIC_NAME)
    .load()
)

In [6]:
from kafka import KafkaAdminClient
# Sanity check: list the topics visible on the same broker the stream reads
# from (KAFKA_SERVER) rather than repeating the address literal.
admin = KafkaAdminClient(bootstrap_servers=KAFKA_SERVER)
try:
    print(admin.list_topics())
finally:
    # Release the client's broker connections once the check is done.
    admin.close()

['review_stream']


In [7]:
# 2. Decode the Kafka message value as a string, parse it as JSON with
#    json_schema, and promote the parsed fields to top-level columns.
from pyspark.sql.functions import col, from_json
parsed_df = (
    kafka_df
    .select(from_json(col("value").cast("string"), json_schema).alias("data"))
    .select("data.*")
)

In [8]:
# 3. Run the pandas UDF on the review text (the only column it needs) and
#    attach its output as 'predictions_prob'.
review_probabilities = predict_sentiments_udf(col("review_text"))
predictions_df = parsed_df.withColumn("predictions_prob", review_probabilities)

In [9]:
from pyspark.sql.functions import udf
from numpy import argmax
# 4. Turn the per-aspect probabilities into predicted labels while keeping the
#    ground-truth label columns that predictions_df already carries.

# UDF mapping a class index (0, 1, 2, 3) back to its label (-1, 0, 1, 2); any
# value missing from inverse_label_map (including null) becomes -99.
udf_inverser = udf(lambda idx: inverse_label_map.get(idx, -99), IntegerType())


def _make_argmax_udf(aspect_index):
    """Build a UDF returning the highest-probability class index for one aspect.

    The aspect position is bound through this factory's argument, so each UDF
    keeps its own index instead of reading a shared loop variable at call time.

    Args:
        aspect_index: Position of the aspect inside the probability array.

    Returns:
        A Spark UDF that takes the probability array column and returns the
        argmax of the sub-array at ``aspect_index`` as an integer, or null
        when the array itself is null.
    """
    def _argmax_for_aspect(prob_array):
        if prob_array is None:
            return None
        return int(argmax(prob_array[aspect_index]))

    return udf(_argmax_for_aspect, IntegerType())


# Add every prediction column in a single select rather than chaining
# withColumn/drop in a loop, then discard the raw probabilities.
result_df = predictions_df.select(
    "*",
    *[
        udf_inverser(_make_argmax_udf(i)(col("predictions_prob"))).alias(f"pred_{aspect}")
        for i, aspect in enumerate(ASPECT_COLUMNS)
    ],
).drop("predictions_prob")
print("Đã định nghĩa luồng xử lý (bao gồm nhãn thật).")

Đã định nghĩa luồng xử lý (bao gồm nhãn thật).


In [10]:
# Columns shown on the console sink: the review text and the eight predictions.
final_output_df = result_df.select(
    "review_text",
    *[
        f"pred_{aspect_name}"
        for aspect_name in (
            "Quality", "Price", "Shipping", "Shop_Service",
            "Size", "Outlook", "General", "Others",
        )
    ],
)

# Start the stream in append mode and print each micro-batch, untruncated,
# to the console.
query = (
    final_output_df.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", "false")
    .start()
)

print(
    "Query đã bắt đầu. Đang chờ dữ liệu từ Kafka...",
    "Chạy cell producer 'sendStream_reviews.ipynb' để gửi dữ liệu.",
    "Nhấn Interrupt Kernel (nút Stop) để dừng stream.",
    sep="\n",
)

# Block until the query ends; an Interrupt Kernel request stops it instead.
try:
    query.awaitTermination()
except KeyboardInterrupt:
    print("Đang dừng query...")
    query.stop()
    print("Query đã dừng.")

25/10/26 03:14:34 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-57d4fcd0-57a3-4ed1-9478-1bec92da2f04. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/26 03:14:34 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


Query đã bắt đầu. Đang chờ dữ liệu từ Kafka...
Chạy cell producer 'sendStream_reviews.ipynb' để gửi dữ liệu.
Nhấn Interrupt Kernel (nút Stop) để dừng stream.
-------------------------------------------
Batch: 0
-------------------------------------------
+-----------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text|pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+-----------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
+-----------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+



25/10/26 03:14:45 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:14:46 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:14:46 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:14:46 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
2025-10-26 03:14:50.314057: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-26 03:14:50.338990: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightl

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                                     |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Giày đẹp đi vừa chân đóng hàng cẩn thận giao nhanh                                                                                              |-1  

25/10/26 03:15:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in Un

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------

25/10/26 03:15:09 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:09 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:09 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 3
-------------------------------------------
+---------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                      |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+---------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|giày ổn giao hàng quá chậm,nhưng shipper nhiệt tình thái độ làm vc quá tệ                                                        |-1          |-1        |1            |-1               |-1      

25/10/26 03:15:12 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:12 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:12 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:12 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 4
-------------------------------------------
+----------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                     |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+----------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Tốt hình ảnh mang tính chất nhận xu                                                                             |-1          |-1        |-1           |-1               |-1       |-1          |-1          |-1         |
|Đã nhận được hàng đúng như

25/10/26 03:15:15 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:15 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:15 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 5
-------------------------------------------
+-------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+-------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Dep mua mang danh cau hay mang thể duc cungg ô sờ kê mua dc giá 145                        |-1          |-1        |-1           |-1               |-1       |1           |1           |-1         |
|Hàng thì xấu phom giày thì dở chả được cái mẹ gì                                           |-1          |-1   

25/10/26 03:15:18 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:18 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:18 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 6
-------------------------------------------
+-----------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                      |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+-----------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Giày siêu đẹp luôn ạ có hơi bẩn 1 tí ở đế giày nhưng giặt chắc sẽ hết, giao hàng khá nhanh, sẽ ủng hộ shop tiếp  |-1          |-1        |1            |-1               |-1       |1           |-1          |-1         |
|Giày mới thì mang vài 

25/10/26 03:15:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 7
-------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                               |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+--------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Phong cách smiley ấn tượng: hình in pop art, áo hoodie thoải mái và denim vừa vặn. Xu hướng mix đồ ấn tượng mới đã ra mắt.|-1          |-1        |-1           |-1               |1        |-1          |-1          |

25/10/26 03:15:23 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:23 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 8
-------------------------------------------
+-----------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+-----------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Giai hàng nhanh  Đúng form Sản phẩm tốt  Sẽ tiếp tục ủng hộ|-1          |-1        |1            |-1               |-1       |1           |-1          |-1         |
|Mặt hàng rất oke nha                                       |-1          |-1        |-1           |-1               |-1       |-1          |1           |-1         |
+-----------------------------------------------------------+------------

25/10/26 03:15:25 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:25 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:25 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 9
-------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                                                                                                                     |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+--

25/10/26 03:15:28 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:28 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 10
-------------------------------------------
+------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                             |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Màu sắc: đen Chất liệu đúng với mô tả Shop gio hàng cần thận, có kèm tất|1           |-1        |-1           |-1               |-1       |1           |-1          |-1         |
|Rất bẩn, kbiet có giặt sạch k                                           |-1          |-1        |-1           |-1               |-1       |1           |-1          |-1         |
+------

25/10/26 03:15:29 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:29 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 11
-------------------------------------------
+------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                 |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Mua hai đoi giao một đôi trên đơn hàng có quà tặng mà k thấy|-1          |-1        |-1           |-1               |-1       |-1          |-1          |-1         |
|Với 83k thì đôi giày này là quá ổn. Vừa from đẹp, ưng ý nhé |-1          |-1        |-1           |-1               |-1       |1           |-1          |-1         |
+------------------------------------------------------------+-----

25/10/26 03:15:33 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:33 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:33 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 12
-------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                                                                                 |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Công nhận giày c

25/10/26 03:15:35 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:35 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 13
-------------------------------------------
+---------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                                        |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+---------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|hàng về nhanh đặt 2 ngày đã về r Màu đúng,đẹp Đế cai 3cm Mk hay đi size 39 nhm mua về hơi rộng xíu Nói chung là hàng đẹp giao nhanh, nên mu

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


Đang dừng query...
Query đã dừng.


In [14]:
# Columns written to the in-memory table: the review text followed, for each
# aspect, by its ground-truth label and then its prediction.
display_columns = ["review_text"] + [
    column_name
    for aspect in ASPECT_COLUMNS
    for column_name in (aspect, f"pred_{aspect}")
]

final_output_df_mem = result_df.select(display_columns)

# Write the stream to the memory sink so the results can be queried by name
# as a table.
query_memory = (
    final_output_df_mem.writeStream
    .outputMode("append")
    .format("memory")
    .queryName("review_predictions_table")
    .start()
)

print("Query (memory) đã bắt đầu. Chạy cell tiếp theo để xem kết quả và đánh giá.")

25/10/26 03:21:34 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-6fe8371f-6e4f-458c-b487-ca9df4dc5dce. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/26 03:21:34 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


Query (memory) đã bắt đầu. Chạy cell tiếp theo để xem kết quả và đánh giá.


25/10/26 03:21:36 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
2025-10-26 03:21:39.411627: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-26 03:21:39.413511: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-26 03:21:39.684622: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-26 03:21:44.188815: I tensorflow/c

In [15]:
from IPython.display import display, clear_output
import time

# Periodically re-render the contents of the in-memory result table until the
# user interrupts the kernel or the streaming query is no longer running.
try:
    while True:
        clear_output(wait=True)
        print("Đang làm mới... (Nhấn Interrupt Kernel để dừng)")
        # Show the current contents of the memory table.
        display(spark.sql("SELECT * FROM review_predictions_table").toPandas())

        # Without this check the loop would keep redrawing stale data forever
        # after the stream died, and the failure would never be reported.
        if not query_memory.isActive:
            stream_error = query_memory.exception()
            if stream_error is not None:
                print(f"Streaming query đã dừng do lỗi: {stream_error}")
            else:
                print("Streaming query không còn chạy; dừng làm mới.")
            break

        time.sleep(5)  # refresh every 5 seconds
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to end the display loop;
    # the streaming query is stopped at the same time.
    print("Đã dừng hiển thị.")
    query_memory.stop()

Đang làm mới... (Nhấn Interrupt Kernel để dừng)


Unnamed: 0,review_text,Price,pred_Price,Shipping,pred_Shipping,Outlook,pred_Outlook,Quality,pred_Quality,Size,pred_Size,Shop_Service,pred_Shop_Service,General,pred_General,Others,pred_Others
0,"Shop đóng gói cẩn thận, giao hàng nhanh ❤❤ Già...",-1,-1,1,1,1,1,-1,-1,-1,-1,1,1,-1,-1,-1,-1
1,Hình ảnh mtc nhận xu thôi. Giao hàng hơi châm....,-1,-1,0,1,1,1,-1,-1,-1,-1,1,1,-1,-1,-1,-1
2,"10 ĐIỂM NHÁ, giày đóng gói giá ok lun cái hộp ...",-1,-1,-1,-1,-1,-1,1,-1,-1,-1,1,1,1,-1,-1,-1
3,Với giá thành này sp cũng đc cho là ổn,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,2,2,-1,-1
4,Tuyệttttttttttttttttttttttttttttttt,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Ko mai choei is so good to hear you are so swe...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,2,2
196,Hang đẹp giống như mo ta,-1,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
197,Giày giống như hình nhưg mà pị trầy vs lại dín...,-1,-1,-1,-1,0,1,-1,-1,-1,-1,1,-1,-1,-1,-1,-1
198,Sản phẩm OK tiền phù hợp với giá,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,2,-1,-1


Đã dừng hiển thị.


25/10/26 03:24:58 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 100, writer: org.apache.spark.sql.execution.streaming.sources.MemoryStreamingWrite@45996a90] is aborting.
25/10/26 03:24:58 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 100, writer: org.apache.spark.sql.execution.streaming.sources.MemoryStreamingWrite@45996a90] aborted.
25/10/26 03:24:59 ERROR Utils: Aborting task
org.apache.spark.TaskKilledException
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:597)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:99)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.

In [16]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col
import pandas as pd

print("--- Bắt đầu Đánh giá Mô hình trên Dữ liệu Đã Thu thập từ Stream ---")

# --- 1. Read predictions and ground-truth labels from the memory sink ---
try:
    # The table holds both the true label (e.g. 'Price') and the prediction
    # (e.g. 'pred_Price') for every aspect.
    eval_df = spark.sql("SELECT * FROM review_predictions_table")

    # Count once and reuse the value: each count() is a separate Spark job, and
    # counting twice could report a different number than the one that was
    # checked if the stream is still appending rows.
    total_rows = eval_df.count()
    if total_rows == 0:
        print("Chưa có dữ liệu trong bảng 'review_predictions_table'. Hãy đợi stream chạy.")
    else:
        print(f"Đã đọc {total_rows} bản ghi từ memory sink để đánh giá.")

        # --- 2. Evaluate each aspect separately ---
        evaluator_f1 = MulticlassClassificationEvaluator(metricName="f1")
        evaluator_accuracy = MulticlassClassificationEvaluator(metricName="accuracy")

        print("\n--- KẾT QUẢ ĐÁNH GIÁ ---")
        total_accuracy = 0
        total_f1 = 0
        valid_aspects = 0

        for aspect in ASPECT_COLUMNS:
            true_col = aspect
            pred_col = f"pred_{aspect}"

            # Rename to the evaluator's default column names ("label" /
            # "prediction") and drop rows where either value is null/NaN.
            # NOTE(review): na.drop() does not remove sentinel values such as
            # -99; if missing labels can be encoded that way, filter them
            # explicitly before evaluating.
            aspect_eval_df = eval_df.select(
                col(true_col).cast("double").alias("label"),
                col(pred_col).cast("double").alias("prediction")
            ).na.drop()

            count = aspect_eval_df.count()
            if count > 0:
                f1_score = evaluator_f1.evaluate(aspect_eval_df)
                accuracy = evaluator_accuracy.evaluate(aspect_eval_df)
                print(f"Khía cạnh: {aspect} ({count} bản ghi)")
                print(f"  Accuracy: {accuracy:.4f}")
                print(f"  F1-Score: {f1_score:.4f}")
                total_accuracy += accuracy
                total_f1 += f1_score
                valid_aspects += 1
            else:
                print(f"Khía cạnh: {aspect} - Không có dữ liệu hợp lệ (non-null) để đánh giá.")

        # Unweighted mean over the aspects that had at least one usable row.
        if valid_aspects > 0:
            avg_accuracy = total_accuracy / valid_aspects
            avg_f1 = total_f1 / valid_aspects
            print("\n--- Trung bình ---")
            print(f"  Average Accuracy: {avg_accuracy:.4f}")
            print(f"  Average F1-Score: {avg_f1:.4f}")
        print("--------------------")


except Exception as e:
    # Best-effort cell: report the problem (e.g. the table does not exist yet)
    # instead of raising, so the notebook can continue.
    print(f"Lỗi khi truy vấn hoặc đánh giá bảng 'review_predictions_table': {e}")
    print("Hãy đảm bảo query ghi vào memory sink đang chạy và đã xử lý dữ liệu.")

print("\n--- Đánh giá Hoàn tất ---")

--- Bắt đầu Đánh giá Mô hình trên Dữ liệu Đã Thu thập từ Stream ---
Đã đọc 200 bản ghi từ memory sink để đánh giá.

--- KẾT QUẢ ĐÁNH GIÁ ---
Khía cạnh: Price (200 bản ghi)
  Accuracy: 0.9200
  F1-Score: 0.9019
Khía cạnh: Shipping (200 bản ghi)
  Accuracy: 0.9500
  F1-Score: 0.9377
Khía cạnh: Outlook (200 bản ghi)
  Accuracy: 0.8700
  F1-Score: 0.8407
Khía cạnh: Quality (200 bản ghi)
  Accuracy: 0.8450
  F1-Score: 0.8163
Khía cạnh: Size (200 bản ghi)
  Accuracy: 0.9400
  F1-Score: 0.9411
Khía cạnh: Shop_Service (200 bản ghi)
  Accuracy: 0.9200
  F1-Score: 0.9098
Khía cạnh: General (200 bản ghi)
  Accuracy: 0.8600
  F1-Score: 0.8357
Khía cạnh: Others (200 bản ghi)
  Accuracy: 0.9350
  F1-Score: 0.9263

--- Trung bình ---
  Average Accuracy: 0.9050
  Average F1-Score: 0.8887
--------------------

--- Đánh giá Hoàn tất ---
