In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

# Versions used to select the Kafka connector artifact for this Spark build.
scala_version = '2.12'
spark_version = '3.5.7'  # Make sure this matches your installed Spark version

# Comma-separated Maven coordinates passed to spark.jars.packages.
packages = ",".join([
    f"org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}",
    "org.apache.kafka:kafka-clients:3.5.0",
])

# Local session using all cores, with the console progress bar disabled.
spark = (
    SparkSession.builder
    .appName("CNN and Kafka")
    .config("spark.jars.packages", packages)
    .config("spark.ui.showConsoleProgress", "false")
    .master("local[*]")
    .getOrCreate()
)

spark

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c1c4119f-73e3-4f92-9348-5486b690d99f;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.7 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
	found org.apache.kafka#kafka-clients;3.5.0 in central
	found com.github.luben#zstd-jni;1.5.5-1 in central
	found org.l

In [2]:
from pyspark.sql.types import ArrayType, FloatType
# Constants carried over from the training notebook.
MODEL_PATH = 'cnn_multi_aspect_model.h5'
TOKENIZER_PATH = 'tokenizer.pickle'
MAX_SEQUENCE_LENGTH = 100

ASPECT_COLUMNS = [
    'Price',
    'Shipping',
    'Outlook',
    'Quality',
    'Size',
    'Shop_Service',
    'General',
    'Others',
]

# label_map: original label (-1, 0, 1, 2) -> class index (0-3).
# inverse_label_map: class index (0-3) -> original label (-1, 0, 1, 2).
label_map = dict(zip((-1, 0, 1, 2), range(4)))
inverse_label_map = dict(zip(label_map.values(), label_map.keys()))

# Broadcast the artifact file paths so the executors can read them.
sc = spark.sparkContext
broadcasted_model_path, broadcasted_tokenizer_path = (
    sc.broadcast(artifact_path) for artifact_path in (MODEL_PATH, TOKENIZER_PATH)
)

# Return schema of the prediction UDF: an array of float arrays
# (per the original note: 8 sub-arrays with 4 probabilities each).
schema_output = ArrayType(elementType=ArrayType(elementType=FloatType()))

In [3]:
from typing import Iterator
from pandas import Series
from pyspark.sql.functions import pandas_udf
@pandas_udf(schema_output)
def predict_sentiments_udf(iterator: Iterator[Series]) -> Iterator[Series]:
    """Predict per-row model outputs for batches of review text.

    The Keras model and the pickled tokenizer are loaded once, before the
    first batch is consumed, and reused for every batch of the iterator.

    Args:
        iterator: Batches of raw review values, one pandas Series per batch.

    Yields:
        One Series per input batch. Each element is a list with one entry per
        model output, each entry being that output's values for the row
        (presumably 8 aspects x 4 probabilities -- confirm against the model).

    Raises:
        FileNotFoundError: If the model or the tokenizer file does not exist
            at the broadcast path.
    """
    # Imports are kept inside the function body, as in the original cell.
    from os import path
    from pickle import load
    from re import sub
    from tensorflow.keras.models import load_model
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    model_path = broadcasted_model_path.value
    tokenizer_path = broadcasted_tokenizer_path.value

    # Fail early with a clear message when either artifact is missing.
    if not (path.exists(model_path) and path.exists(tokenizer_path)):
        raise FileNotFoundError(f"Model/Tokenizer kh√¥ng t√¨m th·∫•y tr√™n worker. ƒê·∫£m b·∫£o {model_path} v√† {tokenizer_path} c√≥ th·ªÉ truy c·∫≠p ƒë∆∞·ª£c.")

    model = load_model(model_path)
    # NOTE(review): pickle.load can execute arbitrary code; only load a
    # tokenizer file that comes from a trusted source.
    with open(tokenizer_path, 'rb') as tokenizer_file:
        tokenizer = load(tokenizer_file)

    def _normalize(raw_text):
        """Lowercase, strip punctuation and digits, then trim whitespace."""
        lowered = str(raw_text).lower()
        without_punctuation = sub(r'[^\w\s]', '', lowered)
        without_digits = sub(r'\d+', '', without_punctuation)
        return without_digits.strip()

    for batch in iterator:
        padded = pad_sequences(
            tokenizer.texts_to_sequences(batch.apply(_normalize)),
            maxlen=MAX_SEQUENCE_LENGTH,
            padding='post',
        )

        # Nothing to predict for an empty batch.
        if len(padded) == 0:
            yield Series([])
            continue

        outputs = model.predict(padded, verbose=0)
        # zip(*outputs) regroups the per-output arrays into one tuple per row.
        yield Series([[head.tolist() for head in row] for row in zip(*outputs)])

In [4]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType
# Schema of the JSON payload: the review text followed by one nullable
# integer label column per aspect.
json_fields = [StructField("review_text", StringType())]
for aspect in ASPECT_COLUMNS:
    json_fields += [StructField(aspect, IntegerType(), nullable=True)]
json_schema = StructType(fields=json_fields)

In [5]:
KAFKA_SERVER = "kafka:9092"
TOPIC_NAME = "review_stream"

# Streaming DataFrame that subscribes to the review topic on the broker.
kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": KAFKA_SERVER,
        "subscribe": TOPIC_NAME,
    })
    .load()
)

In [6]:
from kafka import KafkaAdminClient
# Sanity check: list the topics visible on the broker.
# Reuse KAFKA_SERVER so this check always targets the same broker as the
# streaming reader instead of a duplicated literal address.
admin = KafkaAdminClient(bootstrap_servers=KAFKA_SERVER)
try:
    print(admin.list_topics())
finally:
    # Release the admin client's broker connections even if listing fails.
    admin.close()

['review_stream']


In [7]:
# 2. Parse the JSON payload and extract its columns
from pyspark.sql.functions import col, from_json
# Decode the Kafka value bytes as a string, parse it with json_schema,
# then flatten the resulting struct into top-level columns.
parsed_df = (
    kafka_df
    .select(from_json(col("value").cast("string"), json_schema).alias("data"))
    .select("data.*")
)

In [8]:
# 3. Apply the pandas UDF to get predictions (only 'review_text' is needed)
predictions_df = parsed_df.withColumn(
    colName="predictions_prob",
    col=predict_sentiments_udf(col("review_text")),
)

In [9]:
from pyspark.sql.functions import udf
from numpy import argmax
# 4. Unpack the predictions while keeping the ground-truth label columns.
# predictions_df already holds both the true-label columns and predictions_prob.
result_df = predictions_df

# UDF mapping a class index (0, 1, 2, 3) back to its label (-1, 0, 1, 2);
# any other index maps to the sentinel -99.
udf_inverser = udf(lambda idx: inverse_label_map.get(idx, -99), IntegerType())


def _make_argmax_extractor(aspect_index):
    """Build a UDF returning the argmax index of one aspect's probabilities.

    The aspect index is bound as a closure argument here instead of being read
    from the enclosing loop variable. A lambda defined directly in the loop
    would see whatever value the loop variable holds when the function is
    finally serialized or called, which is only correct by accident of timing.

    Args:
        aspect_index: Position of the aspect inside the predictions array.

    Returns:
        A Spark UDF taking the predictions array column and returning an int.
    """
    def _extract(prob_array):
        return int(argmax(prob_array[aspect_index]))

    return udf(_extract, IntegerType())


for i, aspect in enumerate(ASPECT_COLUMNS):
    # UDF picking the index with the highest probability for this aspect.
    udf_extractor = _make_argmax_extractor(i)

    # Predicted class index (0-3).
    result_df = result_df.withColumn(
        f"pred_idx_{aspect}",
        udf_extractor(col("predictions_prob"))
    )
    # Map the index back to the original label (-1, 0, 1, 2), then drop the
    # intermediate index column.
    result_df = result_df.withColumn(
        f"pred_{aspect}",
        udf_inverser(col(f"pred_idx_{aspect}"))
    ).drop(f"pred_idx_{aspect}")

# The probability column is no longer needed.
result_df = result_df.drop("predictions_prob")
print("ƒê√£ ƒë·ªãnh nghƒ©a lu·ªìng x·ª≠ l√Ω (bao g·ªìm nh√£n th·∫≠t).")

ƒê√£ ƒë·ªãnh nghƒ©a lu·ªìng x·ª≠ l√Ω (bao g·ªìm nh√£n th·∫≠t).


In [10]:
# Select the final columns to display, in console order.
final_output_df = result_df.select(
    ["review_text"]
    + [
        f"pred_{name}"
        for name in (
            "Quality",
            "Price",
            "Shipping",
            "Shop_Service",
            "Size",
            "Outlook",
            "General",
            "Others",
        )
    ]
)

# Start the stream and print each micro-batch to the console, untruncated.
query = (
    final_output_df.writeStream
    .format("console")
    .outputMode("append")
    .option("truncate", "false")
    .start()
)

print(
    "Query ƒë√£ b·∫Øt ƒë·∫ßu. ƒêang ch·ªù d·ªØ li·ªáu t·ª´ Kafka...",
    "Ch·∫°y cell producer 'sendStream_reviews.ipynb' ƒë·ªÉ g·ª≠i d·ªØ li·ªáu.",
    "Nh·∫•n Interrupt Kernel (n√∫t Stop) ƒë·ªÉ d·ª´ng stream.",
    sep="\n",
)

# Block until the query ends; an Interrupt Kernel stops it cleanly.
try:
    query.awaitTermination()
except KeyboardInterrupt:
    print("ƒêang d·ª´ng query...")
    query.stop()
    print("Query ƒë√£ d·ª´ng.")

25/10/26 03:14:34 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-57d4fcd0-57a3-4ed1-9478-1bec92da2f04. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/26 03:14:34 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


Query ƒë√£ b·∫Øt ƒë·∫ßu. ƒêang ch·ªù d·ªØ li·ªáu t·ª´ Kafka...
Ch·∫°y cell producer 'sendStream_reviews.ipynb' ƒë·ªÉ g·ª≠i d·ªØ li·ªáu.
Nh·∫•n Interrupt Kernel (n√∫t Stop) ƒë·ªÉ d·ª´ng stream.
-------------------------------------------
Batch: 0
-------------------------------------------
+-----------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text|pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+-----------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
+-----------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+



25/10/26 03:14:45 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:14:46 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:14:46 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:14:46 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
2025-10-26 03:14:50.314057: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-26 03:14:50.338990: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightl

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                                     |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Gi√†y ƒë·∫πp ƒëi v·ª´a ch√¢n ƒë√≥ng h√†ng c·∫©n th·∫≠n giao nhanh                                                                                    

25/10/26 03:15:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in Un

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------

25/10/26 03:15:09 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:09 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:09 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 3
-------------------------------------------
+---------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                      |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+---------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|gi√†y ·ªïn giao h√†ng qu√° ch·∫≠m,nh∆∞ng shipper nhi·ªát t√¨nh th√°i ƒë·ªô l√†m vc qu√° t·ªá                                                        |-1          |-1        |1            |-1     

25/10/26 03:15:12 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:12 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:12 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:12 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 4
-------------------------------------------
+----------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                     |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+----------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|T·ªët h√¨nh ·∫£nh mang t√≠nh ch·∫•t nh·∫≠n xu                                                                             |-1          |-1        |-1           |-1               |-1       |-1          |-1          |-1         |
|ƒê√£ nh·∫≠n ƒë∆∞

25/10/26 03:15:15 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:15 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:15 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 5
-------------------------------------------
+-------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+-------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Dep mua mang danh cau hay mang th·ªÉ duc cungg √¥ s·ªù k√™ mua dc gi√° 145                        |-1          |-1        |-1           |-1               |-1       |1           |1           |-1         |
|H√†ng th√¨ x·∫•u phom gi√†y th√¨ d·ªü ch·∫£ ƒë∆∞·ª£c c√°i m·∫π g√¨                                     

25/10/26 03:15:18 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:18 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:18 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 6
-------------------------------------------
+-----------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                      |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+-----------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Gi√†y si√™u ƒë·∫πp lu√¥n ·∫° c√≥ h∆°i b·∫©n 1 t√≠ ·ªü ƒë·∫ø gi√†y nh∆∞ng gi·∫∑t ch·∫Øc s·∫Ω h·∫øt, giao h√†ng kh√° nhanh, s·∫Ω ·ªßng h·ªô shop ti·∫øp  |-1          |-1        |1            |-1               |-1       |1           |-1         

25/10/26 03:15:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 7
-------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                               |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+--------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Phong c√°ch smiley ·∫•n t∆∞·ª£ng: h√¨nh in pop art, √°o hoodie tho·∫£i m√°i v√† denim v·ª´a v·∫∑n. Xu h∆∞·ªõng mix ƒë·ªì ·∫•n t∆∞·ª£ng m·ªõi ƒë√£ ra m·∫Øt.|-1          |-1        |-1           |-1               |1  

25/10/26 03:15:23 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:23 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 8
-------------------------------------------
+-----------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+-----------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Giai h√†ng nhanh  ƒê√∫ng form S·∫£n ph·∫©m t·ªët  S·∫Ω ti·∫øp t·ª•c ·ªßng h·ªô|-1          |-1        |1            |-1               |-1       |1           |-1          |-1         |
|M·∫∑t h√†ng r·∫•t oke nha                                       |-1          |-1        |-1           |-1               |-1       |-1          |1           |-1         |
+------------------------------------------------

25/10/26 03:15:25 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:25 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:25 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 9
-------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                                                                                                                     |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+--

25/10/26 03:15:28 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:28 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 10
-------------------------------------------
+------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                             |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|M√†u s·∫Øc: ƒëen Ch·∫•t li·ªáu ƒë√∫ng v·ªõi m√¥ t·∫£ Shop gio h√†ng c·∫ßn th·∫≠n, c√≥ k√®m t·∫•t|1           |-1        |-1           |-1               |-1       |1           |-1          |-1         |
|R·∫•t b·∫©n, kbiet c√≥ gi·∫∑t s·∫°ch k                                           |-1          |-1        |-1           |-1               |-1       |1           |

25/10/26 03:15:29 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:29 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 11
-------------------------------------------
+------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                 |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Mua hai ƒëoi giao m·ªôt ƒë√¥i tr√™n ƒë∆°n h√†ng c√≥ qu√† t·∫∑ng m√† k th·∫•y|-1          |-1        |-1           |-1               |-1       |-1          |-1          |-1         |
|V·ªõi 83k th√¨ ƒë√¥i gi√†y n√†y l√† qu√° ·ªïn. V·ª´a from ƒë·∫πp, ∆∞ng √Ω nh√© |-1          |-1        |-1           |-1               |-1       |1           |-1          |-1         |
+-------------------------------

25/10/26 03:15:33 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:33 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:33 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 12
-------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                                                                                 |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|C√¥ng nh·∫≠n gi√

25/10/26 03:15:35 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/26 03:15:35 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 13
-------------------------------------------
+---------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                                        |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+---------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|h√†ng v·ªÅ nhanh ƒë·∫∑t 2 ng√†y ƒë√£ v·ªÅ r M√†u ƒë√∫ng,ƒë·∫πp ƒê·∫ø cai 3cm Mk hay ƒëi size 39 nhm mua v·ªÅ h∆°i r·ªông x√≠u N√≥i chung l√

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


ƒêang d·ª´ng query...
Query ƒë√£ d·ª´ng.


In [14]:
# Columns written to the memory sink: the review text, then each aspect's
# true label immediately followed by its prediction.
display_columns = ["review_text"]
for aspect in ASPECT_COLUMNS:
    display_columns.extend((aspect, f"pred_{aspect}"))

final_output_df_mem = result_df.select(display_columns)

# Write the stream to an in-memory table that can be queried with spark.sql.
query_memory = (
    final_output_df_mem.writeStream
    .format("memory")
    .queryName("review_predictions_table")
    .outputMode("append")
    .start()
)

print("Query (memory) ƒë√£ b·∫Øt ƒë·∫ßu. Ch·∫°y cell ti·∫øp theo ƒë·ªÉ xem k·∫øt qu·∫£ v√† ƒë√°nh gi√°.")

25/10/26 03:21:34 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-6fe8371f-6e4f-458c-b487-ca9df4dc5dce. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/26 03:21:34 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


Query (memory) ƒë√£ b·∫Øt ƒë·∫ßu. Ch·∫°y cell ti·∫øp theo ƒë·ªÉ xem k·∫øt qu·∫£ v√† ƒë√°nh gi√°.


25/10/26 03:21:36 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
2025-10-26 03:21:39.411627: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-26 03:21:39.413511: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-26 03:21:39.684622: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-26 03:21:44.188815: I tensorflow/c

In [15]:
from IPython.display import display, clear_output
import time

# Poll the in-memory results table and re-render it until the kernel is interrupted.
try:
    while True:
        # wait=True defers clearing until new output is available to replace it.
        clear_output(wait=True)
        print("ƒêang l√†m m·ªõi... (Nh·∫•n Interrupt Kernel ƒë·ªÉ d·ª´ng)")
        # Collect the current contents of the memory sink to the driver, then render.
        predictions_pdf = spark.sql("SELECT * FROM review_predictions_table").toPandas()
        display(predictions_pdf)
        time.sleep(5)  # refresh every 5 seconds
except KeyboardInterrupt:
    # Interrupting the kernel ends the refresh loop and stops the streaming query.
    print("ƒê√£ d·ª´ng hi·ªÉn th·ªã.")
    query_memory.stop()

ƒêang l√†m m·ªõi... (Nh·∫•n Interrupt Kernel ƒë·ªÉ d·ª´ng)


Unnamed: 0,review_text,Price,pred_Price,Shipping,pred_Shipping,Outlook,pred_Outlook,Quality,pred_Quality,Size,pred_Size,Shop_Service,pred_Shop_Service,General,pred_General,Others,pred_Others
0,"Shop ƒë√≥ng g√≥i c·∫©n th·∫≠n, giao h√†ng nhanh ‚ù§‚ù§ Gi√†...",-1,-1,1,1,1,1,-1,-1,-1,-1,1,1,-1,-1,-1,-1
1,H√¨nh ·∫£nh mtc nh·∫≠n xu th√¥i. Giao h√†ng h∆°i ch√¢m....,-1,-1,0,1,1,1,-1,-1,-1,-1,1,1,-1,-1,-1,-1
2,"10 ƒêI·ªÇM NH√Å, gi√†y ƒë√≥ng g√≥i gi√° ok lun c√°i h·ªôp ...",-1,-1,-1,-1,-1,-1,1,-1,-1,-1,1,1,1,-1,-1,-1
3,V·ªõi gi√° th√†nh n√†y sp c≈©ng ƒëc cho l√† ·ªïn,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,2,2,-1,-1
4,Tuy·ªáttttttttttttttttttttttttttttttt,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Ko mai choei is so good to hear you are so swe...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,2,2
196,Hang ƒë·∫πp gi·ªëng nh∆∞ mo ta,-1,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
197,Gi√†y gi·ªëng nh∆∞ h√¨nh nh∆∞g m√† p·ªã tr·∫ßy vs l·∫°i d√≠n...,-1,-1,-1,-1,0,1,-1,-1,-1,-1,1,-1,-1,-1,-1,-1
198,S·∫£n ph·∫©m OK ti·ªÅn ph√π h·ª£p v·ªõi gi√°,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,2,-1,-1


ƒê√£ d·ª´ng hi·ªÉn th·ªã.


25/10/26 03:24:58 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 100, writer: org.apache.spark.sql.execution.streaming.sources.MemoryStreamingWrite@45996a90] is aborting.
25/10/26 03:24:58 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 100, writer: org.apache.spark.sql.execution.streaming.sources.MemoryStreamingWrite@45996a90] aborted.
25/10/26 03:24:59 ERROR Utils: Aborting task
org.apache.spark.TaskKilledException
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:597)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:99)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.

In [16]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col
import pandas as pd

print("--- B·∫Øt ƒë·∫ßu ƒê√°nh gi√° M√¥ h√¨nh tr√™n D·ªØ li·ªáu ƒê√£ Thu th·∫≠p t·ª´ Stream ---")

# --- 1. Read predictions and ground-truth labels from the memory sink ---
try:
    # For every aspect the table holds the true label (e.g. 'Price') and the
    # model prediction (e.g. 'pred_Price').
    eval_df = spark.sql("SELECT * FROM review_predictions_table")

    # Count exactly once: each count() is a separate Spark job, and if the
    # stream is still appending, two counts can disagree with each other.
    total_records = eval_df.count()

    if total_records == 0:
        print("Ch∆∞a c√≥ d·ªØ li·ªáu trong b·∫£ng 'review_predictions_table'. H√£y ƒë·ª£i stream ch·∫°y.")
    else:
        print(f"ƒê√£ ƒë·ªçc {total_records} b·∫£n ghi t·ª´ memory sink ƒë·ªÉ ƒë√°nh gi√°.")

        # --- 2. Evaluate each aspect independently ---
        # Both evaluators use the default column names "label" / "prediction",
        # which is why the columns are aliased to those names below.
        # Per the Spark ML docs, metricName="f1" is the weighted F-measure.
        evaluator_f1 = MulticlassClassificationEvaluator(metricName="f1")
        evaluator_accuracy = MulticlassClassificationEvaluator(metricName="accuracy")

        print("\n--- K·∫æT QU·∫¢ ƒê√ÅNH GI√Å ---")
        total_accuracy = 0
        total_f1 = 0
        valid_aspects = 0

        for aspect in ASPECT_COLUMNS:
            true_col = aspect
            pred_col = f"pred_{aspect}"

            # Select the label and prediction columns, rename them, and drop
            # rows where either value is null.
            # NOTE(review): na.drop() only removes null/NaN rows. If a -99
            # sentinel is used for "no label", it must already have been
            # mapped to null upstream, otherwise it is scored as a regular
            # class here - TODO confirm.
            aspect_eval_df = eval_df.select(
                col(true_col).cast("double").alias("label"),
                col(pred_col).cast("double").alias("prediction")
            ).na.drop()

            count = aspect_eval_df.count()
            if count > 0:
                f1_score = evaluator_f1.evaluate(aspect_eval_df)
                accuracy = evaluator_accuracy.evaluate(aspect_eval_df)
                print(f"Kh√≠a c·∫°nh: {aspect} ({count} b·∫£n ghi)")
                print(f"  Accuracy: {accuracy:.4f}")
                print(f"  F1-Score: {f1_score:.4f}")
                total_accuracy += accuracy
                total_f1 += f1_score
                valid_aspects += 1
            else:
                print(f"Kh√≠a c·∫°nh: {aspect} - Kh√¥ng c√≥ d·ªØ li·ªáu h·ª£p l·ªá (non-null) ƒë·ªÉ ƒë√°nh gi√°.")

        # Unweighted mean over the aspects that had at least one usable row.
        if valid_aspects > 0:
            avg_accuracy = total_accuracy / valid_aspects
            avg_f1 = total_f1 / valid_aspects
            print("\n--- Trung b√¨nh ---")
            print(f"  Average Accuracy: {avg_accuracy:.4f}")
            print(f"  Average F1-Score: {avg_f1:.4f}")
        print("--------------------")


except Exception as e:
    # Best-effort notebook cell: report the failure instead of raising so the
    # closing banner below is still printed.
    print(f"L·ªói khi truy v·∫•n ho·∫∑c ƒë√°nh gi√° b·∫£ng 'review_predictions_table': {e}")
    print("H√£y ƒë·∫£m b·∫£o query ghi v√†o memory sink ƒëang ch·∫°y v√† ƒë√£ x·ª≠ l√Ω d·ªØ li·ªáu.")

print("\n--- ƒê√°nh gi√° Ho√†n t·∫•t ---")

--- B·∫Øt ƒë·∫ßu ƒê√°nh gi√° M√¥ h√¨nh tr√™n D·ªØ li·ªáu ƒê√£ Thu th·∫≠p t·ª´ Stream ---
ƒê√£ ƒë·ªçc 200 b·∫£n ghi t·ª´ memory sink ƒë·ªÉ ƒë√°nh gi√°.

--- K·∫æT QU·∫¢ ƒê√ÅNH GI√Å ---
Kh√≠a c·∫°nh: Price (200 b·∫£n ghi)
  Accuracy: 0.9200
  F1-Score: 0.9019
Kh√≠a c·∫°nh: Shipping (200 b·∫£n ghi)
  Accuracy: 0.9500
  F1-Score: 0.9377
Kh√≠a c·∫°nh: Outlook (200 b·∫£n ghi)
  Accuracy: 0.8700
  F1-Score: 0.8407
Kh√≠a c·∫°nh: Quality (200 b·∫£n ghi)
  Accuracy: 0.8450
  F1-Score: 0.8163
Kh√≠a c·∫°nh: Size (200 b·∫£n ghi)
  Accuracy: 0.9400
  F1-Score: 0.9411
Kh√≠a c·∫°nh: Shop_Service (200 b·∫£n ghi)
  Accuracy: 0.9200
  F1-Score: 0.9098
Kh√≠a c·∫°nh: General (200 b·∫£n ghi)
  Accuracy: 0.8600
  F1-Score: 0.8357
Kh√≠a c·∫°nh: Others (200 b·∫£n ghi)
  Accuracy: 0.9350
  F1-Score: 0.9263

--- Trung b√¨nh ---
  Average Accuracy: 0.9050
  Average F1-Score: 0.8887
--------------------

--- ƒê√°nh gi√° Ho√†n t·∫•t ---
