In [None]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

scala_version = '2.12'
spark_version = '3.5.7'
kafka_clients_version = '3.5.0'
# Local Spark jar directory. Not used when building the session below; the
# name is kept in case another cell refers to it.
SPARK_JARS_DIR="/usr/local/spark/jars/"

# Maven coordinates (group:artifact:version) of the Kafka source for
# Structured Streaming and of the Kafka client library.
packages = [
    f"org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}",
    f"org.apache.kafka:kafka-clients:{kafka_clients_version}",
]

# Maven coordinates have to be passed through `spark.jars.packages` so that
# they are resolved and downloaded; `spark.jars` expects paths/URLs of jar
# files and would not resolve coordinates.
spark = (
    SparkSession.builder
    .appName("CNN and Kafka")
    .master("local[*]")
    .config("spark.jars.packages", ",".join(packages))
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)

spark

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-195f2ca6-9696-4ae4-a82b-04cbebbf7c5c;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.7 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
	found org.apache.kafka#kafka-clients;3.5.0 in central
	found com.github.luben#zstd-jni;1.5.5-1 in central
	found org.l

In [2]:
from pyspark.sql.types import ArrayType, FloatType

# Constants carried over from the training notebook.
MODEL_PATH = 'cnn_multi_aspect_model.h5'
TOKENIZER_PATH = 'tokenizer.pickle'
MAX_SEQUENCE_LENGTH = 100

ASPECT_COLUMNS = [
    'Price', 'Shipping', 'Outlook', 'Quality',
    'Size', 'Shop_Service', 'General', 'Others',
]

# label_map goes from the original label (-1, 0, 1, 2) to the class index
# (0-3); inverse_label_map goes from the class index back to the label.
label_map = {-1: 0, 0: 1, 1: 2, 2: 3}
inverse_label_map = dict((index, label) for label, index in label_map.items())

# Broadcast the file paths so that code running on the executors can read them.
sc = spark.sparkContext
broadcasted_model_path, broadcasted_tokenizer_path = (
    sc.broadcast(file_path) for file_path in (MODEL_PATH, TOKENIZER_PATH)
)

# Return type of the prediction UDF: an array of float arrays. It is meant to
# hold one inner array per aspect (8), each with one probability per class (4).
schema_output = ArrayType(elementType=ArrayType(elementType=FloatType()))

In [3]:
from typing import Iterator
from pandas import Series
from pyspark.sql.functions import pandas_udf
@pandas_udf(schema_output)
def predict_sentiments_udf(iterator: Iterator[Series]) -> Iterator[Series]:
    """Score batches of review text with the multi-aspect Keras model.

    The model and the tokenizer are loaded once per call of this function,
    from the paths held in the broadcast variables, and are then reused for
    every batch produced by ``iterator``.

    Args:
        iterator: Yields pandas Series of raw review strings.

    Yields:
        One pandas Series per input batch. Each element is a list holding one
        list of class probabilities per model output.

    Raises:
        FileNotFoundError: If the model file or the tokenizer file does not
            exist at the broadcast path.
    """
    from tensorflow.keras.models import load_model
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from pickle import load
    from re import sub
    from os import path

    model_path = broadcasted_model_path.value
    tokenizer_path = broadcasted_tokenizer_path.value

    if not path.exists(model_path) or not path.exists(tokenizer_path):
        raise FileNotFoundError(f"Model/Tokenizer kh√¥ng t√¨m th·∫•y tr√™n worker. ƒê·∫£m b·∫£o {model_path} v√† {tokenizer_path} c√≥ th·ªÉ truy c·∫≠p ƒë∆∞·ª£c.")

    model = load_model(model_path)
    # NOTE(review): unpickling runs arbitrary code from the file; only load a
    # tokenizer file that comes from a trusted source.
    with open(tokenizer_path, 'rb') as handle:
        tokenizer = load(handle)

    def clean_text_udf(text):
        """Lower-case the text, then strip punctuation, digits and outer whitespace."""
        text = str(text).lower()
        text = sub(r'[^\w\s]', '', text)
        text = sub(r'\d+', '', text)
        return text.strip()

    for comments_batch in iterator:
        # A missing review would otherwise be stringified to "none"/"nan" by
        # str() and scored as if it were a real word; treat it as empty text.
        cleaned_comments = comments_batch.fillna('').apply(clean_text_udf)

        sequences = tokenizer.texts_to_sequences(cleaned_comments)
        padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

        if len(padded_sequences) > 0:
            # assumes model.predict returns one array per aspect, each with one
            # row per review - TODO confirm against the trained model.
            predictions = model.predict(padded_sequences, verbose=0)
            # Regroup from "per aspect, per review" to "per review, per aspect".
            result = [
                [aspect_probs.tolist() for aspect_probs in per_review]
                for per_review in zip(*predictions)
            ]
        else:
            result = []

        # Hand the scored batch back to Spark.
        yield Series(result)

In [4]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType
# Schema of the JSON messages: the review text plus one nullable integer
# label column per aspect.
json_fields = [StructField("review_text", StringType())] + [
    StructField(aspect_name, IntegerType(), nullable=True)
    for aspect_name in ASPECT_COLUMNS
]
json_schema = StructType(json_fields)

In [5]:
KAFKA_SERVER = "kafka:9092"
TOPIC_NAME = "review_stream"

# Streaming DataFrame that subscribes to TOPIC_NAME on the KAFKA_SERVER broker.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("subscribe", TOPIC_NAME)
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .load()
)

In [6]:
from kafka import KafkaAdminClient
# Sanity check: list the topics on the same broker the stream reads from
# (KAFKA_SERVER), instead of repeating the address as a second literal.
admin = KafkaAdminClient(bootstrap_servers=KAFKA_SERVER)
try:
    print(admin.list_topics())
finally:
    # The client is only needed for this one call; release its connection.
    admin.close()

['HoangPhuc-distributed-video1', 'review_stream']


In [7]:
# 2. Parse the JSON payload and expand its fields into columns
from pyspark.sql.functions import col, from_json

# Kafka's `value` column cast to a string column named json_value.
raw_json = col("value").cast("string").alias("json_value")
# json_value parsed with json_schema into a single struct column named data.
decoded = from_json(col("json_value"), json_schema).alias("data")

parsed_df = kafka_df.select(raw_json).select(decoded).select("data.*")

In [8]:
# 3. Score each review with the pandas UDF (only 'review_text' is needed)
review_probabilities = predict_sentiments_udf(col("review_text"))
predictions_df = parsed_df.withColumn("predictions_prob", review_probabilities)

In [9]:
from pyspark.sql.functions import udf
from numpy import argmax
# 4. Unpack the predictions while keeping the ground-truth label columns.
# predictions_df holds the parsed columns plus predictions_prob.
result_df = predictions_df

# UDF that maps a class index (0, 1, 2, 3) back to its label (-1, 0, 1, 2);
# an index that is not in inverse_label_map becomes -99.
udf_inverser = udf(lambda idx: inverse_label_map.get(idx, -99), IntegerType())


def _make_argmax_udf(aspect_index):
    """Build a UDF returning the argmax of the `aspect_index`-th probability array.

    The index is bound through this function's argument. A lambda written
    directly in the loop would read the loop variable when it runs or is
    serialized, so its result would depend on when that happens.
    """
    def _argmax_for_aspect(prob_array):
        return int(argmax(prob_array[aspect_index]))

    return udf(_argmax_for_aspect, IntegerType())


for i, aspect in enumerate(ASPECT_COLUMNS):
    # UDF that picks the index with the highest probability for this aspect
    udf_extractor = _make_argmax_udf(i)

    # Predicted class index (0-3)
    result_df = result_df.withColumn(
        f"pred_idx_{aspect}",
        udf_extractor(col("predictions_prob"))
    )
    # Map the index back to the original label (-1, 0, 1, 2)
    result_df = result_df.withColumn(
        f"pred_{aspect}",
        udf_inverser(col(f"pred_idx_{aspect}"))
    ).drop(f"pred_idx_{aspect}")  # drop the intermediate index column

# The probability column is no longer needed
result_df = result_df.drop("predictions_prob")
print("ƒê√£ ƒë·ªãnh nghƒ©a lu·ªìng x·ª≠ l√Ω (bao g·ªìm nh√£n th·∫≠t).")

ƒê√£ ƒë·ªãnh nghƒ©a lu·ªìng x·ª≠ l√Ω (bao g·ªìm nh√£n th·∫≠t).


In [10]:
# Columns shown on the console, in display order
console_columns = [
    "review_text",
    "pred_Quality",
    "pred_Price",
    "pred_Shipping",
    "pred_Shop_Service",
    "pred_Size",
    "pred_Outlook",
    "pred_General",
    "pred_Others",
]
final_output_df = result_df.select(*console_columns)

# Start the stream and write every micro-batch to the console, untruncated
query = (
    final_output_df.writeStream
    .format("console")
    .outputMode("append")
    .option("truncate", "false")
    .start()
)

print(
    "Query ƒë√£ b·∫Øt ƒë·∫ßu. ƒêang ch·ªù d·ªØ li·ªáu t·ª´ Kafka...",
    "Ch·∫°y cell producer 'sendStream_reviews.ipynb' ƒë·ªÉ g·ª≠i d·ªØ li·ªáu.",
    "Nh·∫•n Interrupt Kernel (n√∫t Stop) ƒë·ªÉ d·ª´ng stream.",
    sep="\n",
)

# Block until the query ends; interrupting the kernel stops the query.
try:
    query.awaitTermination()
except KeyboardInterrupt:
    print("ƒêang d·ª´ng query...")
    query.stop()
    print("Query ƒë√£ d·ª´ng.")

25/10/27 03:58:52 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-40b966ac-f312-46de-8fa1-eb9cea8b0b9b. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/27 03:58:53 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


Query ƒë√£ b·∫Øt ƒë·∫ßu. ƒêang ch·ªù d·ªØ li·ªáu t·ª´ Kafka...
Ch·∫°y cell producer 'sendStream_reviews.ipynb' ƒë·ªÉ g·ª≠i d·ªØ li·ªáu.
Nh·∫•n Interrupt Kernel (n√∫t Stop) ƒë·ªÉ d·ª´ng stream.
-------------------------------------------
Batch: 0
-------------------------------------------
+-----------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text|pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+-----------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
+-----------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+



25/10/27 03:59:16 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 03:59:18 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 03:59:18 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 03:59:18 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 03:59:18 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 03:59:18 WARN KafkaDataConsumer: KafkaDataConsumer is not running in Un

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                    |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+-----------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|N√≥i chung l√† ti·ªÅn n√†o c·ªßa l·∫•y ·∫°. ƒê·∫ø gi√†y v·ªõi ph·∫ßn v·∫£i g√≥t gi√†y kh√° b·∫©n mong shop ƒë·ªÉ √Ω          |-1          |-1        |-1           |-1               |-1       |1           |-1          |-1         |
|Sieu ∆∞ng luon                                                  

25/10/27 04:00:07 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:07 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:07 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:07 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:07 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:07 WARN KafkaDataConsumer: KafkaDataConsumer is not running in Un

-------------------------------------------
Batch: 2
-------------------------------------------
+----------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                                         |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+----------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|D√†y si√™u ƒë·ªáp lun √°  Tr√™n c·∫£ mong ƒë·ª£i  Shop nhi·ªát t√¨nh Ship th√¢n thi√™n x·ª©ng üíØƒë                                     

25/10/27 04:00:14 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:14 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:14 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:14 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:14 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:15 WARN KafkaDataConsumer: KafkaDataConsumer is not running in Un

-------------------------------------------
Batch: 3
-------------------------------------------
+------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                               |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Gi√†y ƒë·∫πp l·∫Øm m·ªçi ng∆∞·ªùi, gi√° v·∫≠y l√† h·ª£p l√≠ r·ªìi                                                                                             |-1      

25/10/27 04:00:19 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:19 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:19 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:19 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:19 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:19 WARN KafkaDataConsumer: KafkaDataConsumer is not running in Un

-------------------------------------------
Batch: 4
-------------------------------------------
+------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                       |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|tr ƒë·∫•t oiii gi√†y vs gi√° nh∆∞ nnay th√¨ kh√¥ng c√≥ ch·ªó n√†o ƒë·ªÉ ch√™ nha, m·ªói t·ªôi l√† h·ªôp b·ªã r√°ch thuiii                   |-1          |-1        |-1           |0                |-1       |-1          |-1          |-1    

25/10/27 04:00:24 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:24 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:24 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:24 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:24 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 5
-------------------------------------------
+----------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                             |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+----------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|S·∫£n ph·∫©m r·∫•t tuy·ªát v·ªùi v√† x·ªãn x√≤ ·∫°, em th√≠ch l·∫Øm ·∫° c·∫£m ∆°n shop nhi·ªÅu ·∫°a                                                                 |1       

25/10/27 04:00:34 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:34 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:34 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:34 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:34 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:34 WARN KafkaDataConsumer: KafkaDataConsumer is not running in Un

-------------------------------------------
Batch: 6
-------------------------------------------
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                                                                 |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|S·∫£n ph·∫©m ƒë·∫πp ƒë√≥ng g√≥i k·ªâ c√†ng 10ƒë                  

25/10/27 04:00:39 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:39 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:39 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:40 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:40 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 7
-------------------------------------------
+-----------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                      |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+-----------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|Giao h√†ng r·∫•t l√† nhanh, ƒë√≥ng g√≥i h√†ng r·∫•t c·∫©n th·∫≠n, S·∫£n ph·∫©m oke ƒëeo m·ªÅm h∆°n                                     |-1          |-1        |1            |1                |-1       |-1          |-1          |-1         |
|

25/10/27 04:00:46 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:46 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:46 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:47 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:47 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:47 WARN KafkaDataConsumer: KafkaDataConsumer is not running in Un

-------------------------------------------
Batch: 8
-------------------------------------------
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                                                                                                                                                                                                                      |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+----------------------------------------------

25/10/27 04:00:51 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:51 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:51 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:52 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:52 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


-------------------------------------------
Batch: 9
-------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|review_text                                                                                                                                            |pred_Quality|pred_Price|pred_Shipping|pred_Shop_Service|pred_Size|pred_Outlook|pred_General|pred_Others|
+-------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+-------------+-----------------+---------+------------+------------+-----------+
|S·∫£n ph·∫©m n√†y r·∫•t ok                                                                                                      

25/10/27 04:00:56 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:56 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:57 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:57 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:00:57 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:01:02 ERROR Utils: Aborting task
org.apache.spark.api.python.PythonE

ƒêang d·ª´ng query...
Query ƒë√£ d·ª´ng.


In [11]:
# Columns written to the memory sink: the review text, then for each aspect
# its ground-truth column followed by its prediction column.
display_columns = ["review_text"] + [
    column_name
    for aspect in ASPECT_COLUMNS
    for column_name in (aspect, f"pred_{aspect}")
]

final_output_df_mem = result_df.select(*display_columns)

# Write the stream to an in-memory table that can be queried with spark.sql
# under the name given to queryName.
query_memory = (
    final_output_df_mem.writeStream
    .queryName("review_predictions_table")
    .format("memory")
    .outputMode("append")
    .start()
)

print("Query (memory) ƒë√£ b·∫Øt ƒë·∫ßu. Ch·∫°y cell ti·∫øp theo ƒë·ªÉ xem k·∫øt qu·∫£ v√† ƒë√°nh gi√°.")

25/10/27 04:01:28 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-79dcc510-e604-4815-a39e-351e109494c9. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/27 04:01:28 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


Query (memory) ƒë√£ b·∫Øt ƒë·∫ßu. Ch·∫°y cell ti·∫øp theo ƒë·ªÉ xem k·∫øt qu·∫£ v√† ƒë√°nh gi√°.


25/10/27 04:01:30 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
2025-10-27 04:01:37.806548: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-27 04:01:37.816277: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-27 04:01:39.415947: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
from IPython.display import display, clear_output
import time

def _render_latest_predictions():
    """Clear the cell output and display the current content of the memory table."""
    clear_output(wait=True)
    print("ƒêang l√†m m·ªõi... (Nh·∫•n Interrupt Kernel ƒë·ªÉ d·ª´ng)")
    # Show the table backed by the memory sink
    latest_rows = spark.sql("SELECT * FROM review_predictions_table")
    display(latest_rows.toPandas())


# Redraw every 5 seconds until the kernel is interrupted, then stop the
# memory-sink query.
try:
    while True:
        _render_latest_predictions()
        time.sleep(5)
except KeyboardInterrupt:
    print("ƒê√£ d·ª´ng hi·ªÉn th·ªã.")
    query_memory.stop()

ƒêang l√†m m·ªõi... (Nh·∫•n Interrupt Kernel ƒë·ªÉ d·ª´ng)


Unnamed: 0,review_text,Price,pred_Price,Shipping,pred_Shipping,Outlook,pred_Outlook,Quality,pred_Quality,Size,pred_Size,Shop_Service,pred_Shop_Service,General,pred_General,Others,pred_Others
0,"R·∫•t h√†i l√≤ng v·ªõi s·∫£n ph·∫©m, mu·ªën mua c√°c s·∫£n ph...",-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,1,-1,-1
1,n√™n mua nha mng,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,2,-1,-1,-1
2,s·∫£n ph·∫©m r·∫•t ƒë·∫πp nha mn n√™n mua d∆∞ 1 ƒë·∫øn 2 siz...,-1,-1,-1,-1,1,1,-1,-1,-1,-1,1,1,-1,-1,-1,-1
3,S·∫£n ph·∫©m ƒë·∫°t ch·∫•t l∆∞·ª£ng so v·ªõi gi√°,-1,-1,-1,-1,-1,-1,-1,1,-1,-1,-1,-1,1,-1,-1,-1
4,L·ªó x·ªè d√¢y h∆°i x·∫•u,-1,-1,-1,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
5,gi√†y ƒë·∫πp nh∆∞ng ch·∫≠t qu√° so v·ªõi size deo l√¢u r·∫•...,-1,-1,-1,-1,1,1,-1,-1,0,0,-1,-1,-1,-1,-1,-1
6,Q∆∞ertyuiopasdfghjklzxcvbnm1234567890-:;)&amp;@...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,2,2
7,Nxhdhdndjdndndndndndndndndndndndnnnnnnn,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,2,2
8,Gi·∫ßy ƒë·∫πp. Ck m√¨nh ƒëi v·ª´a in. ∆Øng √Ω nh√© cho sh...,-1,-1,-1,-1,1,1,-1,-1,1,1,-1,-1,-1,-1,-1,-1
9,Gi√†y ch·∫•t l∆∞·ª£ng l·∫Øm nha sƒÉn ƒë∆∞·ª£c l√™n gi√° r·∫ª l·∫Ø...,1,1,-1,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,-1


25/10/27 04:02:36 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:02:36 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/10/27 04:02:37 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


ƒê√£ d·ª´ng hi·ªÉn th·ªã.


25/10/27 04:02:37 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 10, writer: org.apache.spark.sql.execution.streaming.sources.MemoryStreamingWrite@c1c3bcd] is aborting.
25/10/27 04:02:37 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 10, writer: org.apache.spark.sql.execution.streaming.sources.MemoryStreamingWrite@c1c3bcd] aborted.
25/10/27 04:02:37 ERROR Utils: Aborting task
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/python3.10/importlib/__init__.py", line 70, in invalidate_caches
    finder.invalidate_caches()
  File "<frozen importlib._bootstrap_external>", line 1338, in invalidate_caches
KeyboardInterrupt

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePytho

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col
import pandas as pd

print("--- B·∫Øt ƒë·∫ßu ƒê√°nh gi√° M√¥ h√¨nh tr√™n D·ªØ li·ªáu ƒê√£ Thu th·∫≠p t·ª´ Stream ---")

# Purpose of this cell: score the predictions collected in the
# 'review_predictions_table' memory-sink table against their ground-truth labels,
# one aspect at a time (accuracy and F1 from MulticlassClassificationEvaluator),
# then print the unweighted mean of the per-aspect scores.

# --- 1. Read predictions and ground-truth labels from the memory sink ---
eval_df = None  # bound up front so the `finally` clause can always test it
try:
    # The table is expected to hold, for every aspect, both the true label column
    # (e.g. 'Price') and the predicted one (e.g. 'pred_Price').
    # cache() pins one snapshot of the table so that the row count and every
    # per-aspect evaluation below see the same rows, even if the streaming query
    # is still appending to the sink while this cell runs.
    eval_df = spark.sql("SELECT * FROM review_predictions_table").cache()

    # Count once and reuse the value: each count() is a separate Spark job, and
    # two separate counts could disagree on a table that is still growing.
    total_rows = eval_df.count()

    if total_rows == 0:
        print("Ch∆∞a c√≥ d·ªØ li·ªáu trong b·∫£ng 'review_predictions_table'. H√£y ƒë·ª£i stream ch·∫°y.")
    else:
        print(f"ƒê√£ ƒë·ªçc {total_rows} b·∫£n ghi t·ª´ memory sink ƒë·ªÉ ƒë√°nh gi√°.")

        # --- 2. Evaluate each aspect ---
        # Both evaluators use the default column names "label" and "prediction",
        # which is why the columns are aliased to those names below.
        evaluator_f1 = MulticlassClassificationEvaluator(metricName="f1")
        evaluator_accuracy = MulticlassClassificationEvaluator(metricName="accuracy")

        print("\n--- K·∫æT QU·∫¢ ƒê√ÅNH GI√Å ---")
        total_accuracy = 0
        total_f1 = 0
        valid_aspects = 0

        for aspect in ASPECT_COLUMNS:
            true_col = aspect
            pred_col = f"pred_{aspect}"

            # Pick the label and prediction columns, cast them to double (the type
            # the evaluator expects), rename them, and drop rows with a null value.
            # NOTE(review): na.drop() only removes null/NaN rows. If a missing label
            # is encoded with a sentinel such as -99 it is NOT removed here and would
            # be scored as an ordinary class — confirm upstream maps it to null.
            aspect_eval_df = eval_df.select(
                col(true_col).cast("double").alias("label"),
                col(pred_col).cast("double").alias("prediction")
            ).na.drop()

            count = aspect_eval_df.count()
            if count > 0:
                f1_score = evaluator_f1.evaluate(aspect_eval_df)
                accuracy = evaluator_accuracy.evaluate(aspect_eval_df)
                print(f"Kh√≠a c·∫°nh: {aspect} ({count} b·∫£n ghi)")
                print(f"  Accuracy: {accuracy:.4f}")
                print(f"  F1-Score: {f1_score:.4f}")
                total_accuracy += accuracy
                total_f1 += f1_score
                valid_aspects += 1
            else:
                print(f"Kh√≠a c·∫°nh: {aspect} - Kh√¥ng c√≥ d·ªØ li·ªáu h·ª£p l·ªá (non-null) ƒë·ªÉ ƒë√°nh gi√°.")

        # Average only over the aspects that actually had rows to evaluate.
        if valid_aspects > 0:
            avg_accuracy = total_accuracy / valid_aspects
            avg_f1 = total_f1 / valid_aspects
            print("\n--- Trung b√¨nh ---")
            print(f"  Average Accuracy: {avg_accuracy:.4f}")
            print(f"  Average F1-Score: {avg_f1:.4f}")
        print("--------------------")

except Exception as e:
    # Deliberately broad: in the notebook a missing table or a failed Spark job
    # should produce a readable hint instead of aborting the remaining cells.
    print(f"L·ªói khi truy v·∫•n ho·∫∑c ƒë√°nh gi√° b·∫£ng 'review_predictions_table': {e}")
    print("H√£y ƒë·∫£m b·∫£o query ghi v√†o memory sink ƒëang ch·∫°y v√† ƒë√£ x·ª≠ l√Ω d·ªØ li·ªáu.")
finally:
    # Release the cached snapshot so a later read of the table (or a re-run of
    # this cell) picks up fresh sink contents instead of the stale cached rows.
    if eval_df is not None:
        eval_df.unpersist()

print("\n--- ƒê√°nh gi√° Ho√†n t·∫•t ---")

--- B·∫Øt ƒë·∫ßu ƒê√°nh gi√° M√¥ h√¨nh tr√™n D·ªØ li·ªáu ƒê√£ Thu th·∫≠p t·ª´ Stream ---
ƒê√£ ƒë·ªçc 63 b·∫£n ghi t·ª´ memory sink ƒë·ªÉ ƒë√°nh gi√°.

--- K·∫æT QU·∫¢ ƒê√ÅNH GI√Å ---
Kh√≠a c·∫°nh: Price (63 b·∫£n ghi)
  Accuracy: 0.9048
  F1-Score: 0.8675
Kh√≠a c·∫°nh: Shipping (63 b·∫£n ghi)
  Accuracy: 0.9841
  F1-Score: 0.9920
Kh√≠a c·∫°nh: Outlook (63 b·∫£n ghi)
  Accuracy: 0.9048
  F1-Score: 0.8841
Kh√≠a c·∫°nh: Quality (63 b·∫£n ghi)
  Accuracy: 0.9048
  F1-Score: 0.9069
Kh√≠a c·∫°nh: Size (63 b·∫£n ghi)
  Accuracy: 0.8889
  F1-Score: 0.8722
Kh√≠a c·∫°nh: Shop_Service (63 b·∫£n ghi)
  Accuracy: 0.9206
  F1-Score: 0.9081
Kh√≠a c·∫°nh: General (63 b·∫£n ghi)
  Accuracy: 0.8254
  F1-Score: 0.8009
Kh√≠a c·∫°nh: Others (63 b·∫£n ghi)
  Accuracy: 0.9841
  F1-Score: 0.9833

--- Trung b√¨nh ---
  Average Accuracy: 0.9147
  Average F1-Score: 0.9019
--------------------

--- ƒê√°nh gi√° Ho√†n t·∫•t ---
