In [None]:
# docker exec -it kafka kafka-console-consumer --bootstrap-server localhost:9092 --topic invoices --from-beginning

# docker exec -it kafka kafka-console-producer --bootstrap-server localhost:9092 --topic invoices

In [None]:
# mantappppppppppppppppppppppppppp
# Create the Spark Session
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import to_avro
from pyspark.sql.functions import from_json, col, expr, struct
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, LongType, DoubleType, IntegerType

# Maven coordinates resolved at session start-up: the Kafka connector for
# Structured Streaming and the Avro module that provides to_avro().
_spark_packages = [
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0",
    "org.apache.spark:spark-avro_2.12:3.5.0",
]

# Build (or reuse) a session that runs locally with master "local[*]".
_builder = SparkSession.builder.appName("Streaming from Kafka")
# NOTE(review): this key is listed among the DStream ("Spark Streaming")
# settings in the Spark configuration docs -- confirm it has an effect on
# Structured Streaming queries.
_builder = _builder.config("spark.streaming.stopGracefullyOnShutdown", True)
_builder = _builder.config("spark.jars.packages", ",".join(_spark_packages))
spark = _builder.master("local[*]").getOrCreate()





# Nested struct stored in the "DeliveryAddress" field of an invoice.
delivery_address_schema = (
    StructType()
    .add("AddressLine", StringType())
    .add("City", StringType())
    .add("State", StringType())
    .add("PinCode", StringType())
    .add("ContactNumber", StringType())
)

# Element type of the "InvoiceLineItems" array.
line_item_schema = (
    StructType()
    .add("ItemCode", StringType())
    .add("ItemDescription", StringType())
    .add("ItemPrice", DoubleType())
    .add("ItemQty", IntegerType())
    .add("TotalValue", DoubleType())
)

# Schema used to parse the JSON invoice payload read from Kafka.
# Every field keeps the default nullability (nullable=True).
schema = (
    StructType()
    .add("InvoiceNumber", StringType())
    .add("CreatedTime", LongType())
    .add("StoreID", StringType())
    .add("PosID", StringType())
    .add("CashierID", StringType())
    .add("CustomerType", StringType())
    .add("CustomerCardNo", StringType())
    .add("TotalAmount", DoubleType())
    .add("NumberOfItems", IntegerType())
    .add("PaymentMethod", StringType())
    .add("CGST", DoubleType())
    .add("SGST", DoubleType())
    .add("CESS", DoubleType())
    .add("DeliveryType", StringType())
    .add("DeliveryAddress", delivery_address_schema)
    .add("InvoiceLineItems", ArrayType(line_item_schema))
)

# Streaming source: subscribe to the "invoices" topic on the broker at
# kafka:9092. "earliest" asks for the oldest available offsets; per the Kafka
# integration guide this only applies when the query starts without an
# existing checkpoint.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "invoices")
    .option("startingOffsets", "earliest")
    .load()
)

# value_df = kafka_df.select(from_json(col("value").cast("string"), schema).alias("value"))

# notification_df = value_df.select("value.InvoiceNumber", "value.CustomerCardNo", "value.TotalAmount") \
#         .withColumn("EarnedLoyaltyPoints", expr("TotalAmount * 0.2"))

# from pyspark.sql.functions import to_json, struct

# # Build the value column in JSON format
# output_df = notification_df.selectExpr("InvoiceNumber as key",
#                                                  """to_json(named_struct(
#                                                  'CustomerCardNo', CustomerCardNo,
#                                                  'TotalAmount', TotalAmount,
#                                                  'EarnedLoyaltyPoints', TotalAmount * 0.2)) as value""")

# Decode the Kafka message payload: cast the "value" column to a string and
# parse it as JSON with `schema`, producing a single struct column "value".
invoice_struct = from_json(col("value").cast("string"), schema)
value_df = kafka_df.select(invoice_struct.alias("value"))

# Invoice-level fields that are repeated on every line-item row.
invoice_header_exprs = [
    "value.InvoiceNumber",
    "value.CreatedTime",
    "value.StoreID",
    "value.PosID",
    "value.CustomerType",
    "value.CustomerCardNo",
    "value.DeliveryType",
    "value.DeliveryAddress.City",
    "value.DeliveryAddress.State",
    "value.DeliveryAddress.PinCode",
]

# explode() emits one row per element of InvoiceLineItems; the element is
# exposed as the struct column "LineItem".
explode_df = value_df.selectExpr(
    *invoice_header_exprs,
    "explode(value.InvoiceLineItems) as LineItem",
)

# Promote each sub-field of the LineItem struct to a top-level column (appended
# in this order), then drop the now-redundant struct column.
line_item_fields = ("ItemCode", "ItemDescription", "ItemPrice", "ItemQty", "TotalValue")

flattened_df = explode_df
for field_name in line_item_fields:
    flattened_df = flattened_df.withColumn(field_name, expr(f"LineItem.{field_name}"))
flattened_df = flattened_df.drop("LineItem")

# Shape rows for the Kafka sink: the invoice number becomes the message key
# and all flattened columns are packed into a struct and Avro-encoded as the
# message value.
invoice_key = expr("InvoiceNumber as key")
avro_payload = to_avro(struct("*")).alias("value")
kafka_target_df = flattened_df.select(invoice_key, avro_payload)

# Write to Kafka as a streaming query: key/value records go to the
# "invoice-items" topic on the broker at kafka:9092, with query progress
# checkpointed under "chk-point-dir".
invoice_items_writer = (
    kafka_target_df.writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("checkpointLocation", "chk-point-dir")
    .option("topic", "invoice-items")
)
query = invoice_items_writer.start()

# Block the driver until the streaming query stops or fails.
query.awaitTermination()